Hi Arild,
Please try applying the following code to your file:
// Blanks out the content of every stream, literal string and hexadecimal string in a PDF
// file and writes the result next to the original as "<name>-stripped<ext>".
// Every replacement has exactly the same length as the text it replaces, so no byte in
// the file changes position.
var desktop = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
var fileName = "your-file-name.pdf";

// Map each byte to the char with the same numeric value so the text can be turned back
// into the identical byte sequence when it is written out.
var str = new string(Array.ConvertAll(File.ReadAllBytes(Path.Combine(desktop, fileName)), b => (char)b));

// Bail out when the file mentions cross-reference streams or object streams.
if (str.Contains("/XRef") || str.Contains("/ObjStm"))
{
    Console.WriteLine("PDF file probably contains cross-reference stream or object stream and cannot be processed.");
    return;
}

// Strip all streams: keep the "stream" / "endstream" markers and their line breaks,
// overwrite the data in between with the same number of spaces.
str = Regex.Replace(
    str,
    @"(?<start>\bstream(\r\n|\n))(?<data>(.|\n)*?)(?<end>(\r\n|\r|\n)endstream\b)",
    m => m.Groups["start"].Value + new string(' ', m.Groups["data"].Length) + m.Groups["end"].Value);

// Strip all literal strings.
// The lazy pattern stops at the first ')' after a '('. When the match contains a further
// unescaped '(' the string is nested: the whole match (including that ')') is blanked and
// only the opening '(' is kept, so the next pass can pair it with the real closing ')'.
// The loop repeats until a pass finds no nested string.
// NOTE(review): an escaped closing parenthesis "\)" still ends the lazy match early, so
// text following it inside the same literal string is not blanked.
bool hasBalancedParentheses;
do
{
    hasBalancedParentheses = false;
    str = Regex.Replace(str, @"\((.|\n)*?\)", m =>
    {
        var text = m.Value;
        var lastIndex = text.LastIndexOf('(');

        // Count the backslashes directly in front of the last '(' WITHIN THE MATCH.
        // An odd count means the parenthesis itself is escaped; an even count means the
        // backslashes escape each other and the parenthesis is a real nested one.
        var precedingBackslashes = 0;
        for (var i = lastIndex - 1; i > 0 && text[i] == '\\'; i--)
        {
            precedingBackslashes++;
        }

        if (lastIndex > 0 && precedingBackslashes % 2 == 0)
        {
            hasBalancedParentheses = true;
            return '(' + new string(' ', m.Length - 1);
        }

        return '(' + new string(' ', m.Length - 2) + ')';
    });
}
while (hasBalancedParentheses);

// Strip all hexadecimal strings: a single '<' (not part of "<<") up to the next '>';
// the content is overwritten with the same number of '0' characters.
str = Regex.Replace(str, @"((?<!<)<(?!<))[^>]*?>", m => '<' + new string('0', m.Length - 2) + '>');

// Convert the chars back to bytes and save the stripped copy beside the original.
File.WriteAllBytes(
    Path.Combine(desktop, Path.GetFileNameWithoutExtension(fileName) + "-stripped" + Path.GetExtension(fileName)),
    Array.ConvertAll(str.ToCharArray(), c => (byte)c));
If your file contains neither cross-reference streams (/XRef) nor object streams (/ObjStm), this code should be able to replace the content of all streams and literal strings with spaces, and the content of all hexadecimal strings with zeros, without making the cross-reference table invalid.
Changing other objects is not required because PDF encryption encrypts only strings and streams.
Regards,
Stipo