Joseph Michael Pesch
VP Programming

Extract PDF File(s) from Adobe PDF Portfolio File Using iTextSharp Open Source PDF Library

Posted 28 August 2016, 07:23

Using the open-source iTextSharp PDF library, the console application below opens one or more PDF portfolio files (selected by a source path and a file mask) and extracts a single embedded PDF, identified by its document name, from each of them.

using iTextSharp.text.pdf;
using System;
using System.Collections.Generic;
using System.IO;

namespace PdfPortfolioSample
{
  /// <summary>
  /// Console utility that collects PDF files from a folder and, for each one,
  /// extracts the embedded file whose name matches a requested document name.
  /// </summary>
  class Program
  {
    /// <summary>
    /// Maximum number of leading characters of the source file name that are
    /// prepended to the name of each extracted file.
    /// </summary>
    private const int SourcePrefixLength = 10;

    /// <summary>
    /// Prompts for the source path, file mask, recursion flag, target path and
    /// document name, then runs the extraction for every matching file.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
      Console.Write("Enter source path: ");
      string sourcePath = Console.ReadLine();
      Console.Write("Enter file mask (e.g. *.pdf): ");
      string fileMask = Console.ReadLine();
      Console.Write("Recursive (y/n): ");
      // ReadLine returns null when the input stream is closed, so guard before comparing.
      string recursiveAnswer = Console.ReadLine();
      bool recursive = recursiveAnswer != null
        && string.Equals(recursiveAnswer.Trim(), "Y", StringComparison.OrdinalIgnoreCase);
      Console.Write("Enter target path: ");
      string targetPath = Console.ReadLine();
      Console.Write("Enter document name to extract (e.g. MLPA.PDF): ");
      string docName = Console.ReadLine();

      if (string.IsNullOrEmpty(sourcePath) || string.IsNullOrEmpty(targetPath) || string.IsNullOrEmpty(docName))
      {
        Console.WriteLine("Source path, target path and document name are required.");
        return;
      }

      List<string> files = GetFiles(sourcePath, fileMask, recursive);
      foreach (string file in files)
      {
        // Log and continue so that one unreadable file does not abort the whole
        // batch; this mirrors the log-and-continue handling in GetFiles.
        try
        {
          GetPdfFromPortfolio(file, targetPath, docName);
        }
        catch (Exception e)
        {
          Console.WriteLine("Failed to process " + file + ": " + e.ToString());
        }
      }
    }

    /// <summary>
    /// Opens the PDF at <paramref name="filePath"/>, walks the embedded-files name
    /// array in its catalog and writes every embedded file whose name equals
    /// <paramref name="docName"/> (ignoring case) into <paramref name="targetPath"/>.
    /// The output file name is the first <see cref="SourcePrefixLength"/> characters
    /// of the source file name followed by the embedded file name.
    /// </summary>
    /// <param name="filePath">Path of the PDF file to read.</param>
    /// <param name="targetPath">Directory that receives the extracted file.</param>
    /// <param name="docName">Embedded file name to look for.</param>
    private static void GetPdfFromPortfolio(string filePath, string targetPath, string docName)
    {
      PdfReader reader = new PdfReader(filePath);
      try
      {
        PdfDictionary root = reader.Catalog;
        PdfDictionary documentnames = root.GetAsDict(PdfName.NAMES);
        if (documentnames == null)
        {
          return; // No name dictionary: nothing is embedded in this file.
        }

        PdfDictionary embeddedfiles = documentnames.GetAsDict(PdfName.EMBEDDEDFILES);
        if (embeddedfiles == null)
        {
          return;
        }

        // NOTE(review): only a Names array directly under EmbeddedFiles is read.
        // If a file stores its entries under child nodes instead, nothing is
        // extracted - confirm against the PDF name-tree specification.
        PdfArray filespecs = embeddedfiles.GetAsArray(PdfName.NAMES);
        if (filespecs == null)
        {
          return;
        }

        // Source names shorter than the prefix length are used whole instead of
        // letting Substring throw.
        string sourceName = Path.GetFileName(filePath);
        string prefix = sourceName.Length > SourcePrefixLength
          ? sourceName.Substring(0, SourcePrefixLength)
          : sourceName;

        // The array holds (name, file specification) pairs; only the specification
        // at the odd index is needed. A trailing unpaired entry is ignored.
        for (int i = 0; i + 1 < filespecs.Size; i += 2)
        {
          PdfDictionary filespec = filespecs.GetAsDict(i + 1);
          if (filespec == null)
          {
            continue;
          }

          PdfDictionary refs = filespec.GetAsDict(PdfName.EF);
          if (refs == null)
          {
            continue;
          }

          foreach (PdfName key in refs.Keys)
          {
            // The same key is looked up in the file specification to get the
            // file name that belongs to this embedded stream.
            PdfString embeddedName = filespec.GetAsString(key);
            if (embeddedName == null)
            {
              continue;
            }

            string name = embeddedName.ToString();
            if (!string.Equals(name, docName, StringComparison.OrdinalIgnoreCase))
            {
              continue;
            }

            // Resolve the stream only for a matching entry, and skip entries
            // that do not resolve to a stream rather than failing on a cast.
            PRStream stream = PdfReader.GetPdfObject(refs.GetAsIndirectObject(key)) as PRStream;
            if (stream == null)
            {
              continue;
            }

            byte[] attachment = PdfReader.GetStreamBytes(stream);
            string outputPath = Path.Combine(targetPath, prefix + name);

            // FileMode.Create truncates an existing file, so a previous, longer
            // extraction cannot leave stale bytes after the new content.
            using (FileStream fs = new FileStream(outputPath, FileMode.Create))
            {
              fs.Write(attachment, 0, attachment.Length);
            }
          }
        }
      }
      finally
      {
        // Release the handle on the source file even when extraction fails.
        reader.Close();
      }
    }

    /// <summary>
    /// Collects the files in <paramref name="path"/> that match
    /// <paramref name="fileMask"/>, optionally descending into subdirectories.
    /// Exceptions are written to the console and the files gathered so far are
    /// returned.
    /// </summary>
    /// <param name="path">Directory to search.</param>
    /// <param name="fileMask">Search pattern; null or empty is treated as "*".</param>
    /// <param name="recursive">True to include all subdirectories.</param>
    /// <param name="files">Accumulator used by the recursive calls; a new list is created when null.</param>
    /// <returns>The list holding every file found.</returns>
    private static List<string> GetFiles(string path, string fileMask = "", bool recursive = false, List<string> files = null)
    {
      if (files == null)
      {
        files = new List<string>();
      }

      // The declared default mask is the empty string; substitute "*" so the
      // default means "all files" instead of relying on how Directory.GetFiles
      // treats an empty pattern.
      string pattern = string.IsNullOrEmpty(fileMask) ? "*" : fileMask;

      try
      {
        files.AddRange(Directory.GetFiles(path, pattern));

        if (recursive)
        {
          foreach (string dir in Directory.GetDirectories(path))
          {
            GetFiles(dir, pattern, recursive, files);
          }
        }
      }
      catch (Exception e)
      {
        Console.WriteLine(e.ToString());
      }

      return files;
    }
  }
}

Tags:

C# | iTextSharp | PDF