PDFsharp - moved to http://forum.pdfsharp.net/ Forum Index PDFsharp - moved to http://forum.pdfsharp.net/
Please visit the new PDFsharp forum at http://forum.pdfsharp.net/
 
 FAQFAQ   SearchSearch   MemberlistMemberlist   UsergroupsUsergroups   RegisterRegister 
 ProfileProfile   Log in to check your private messagesLog in to check your private messages   Log inLog in 

How to Extract Unicode Text From PDF Files?

 
This forum is locked: you cannot post, reply to, or edit topics.   This topic is locked: you cannot edit posts or make replies.    PDFsharp - moved to http://forum.pdfsharp.net/ Forum Index -> Support - moved to http://forum.pdfsharp.net/
View previous topic :: View next topic  
Author Message
mohammad



Joined: 04 Nov 2008
Posts: 1

PostPosted: Tue Nov 04, 2008 11:08 am    Post subject: How to Extract Unicode Text From PDF Files? Reply with quote

Dear All,
Is there any way to use PDFSharp or MigraDoc to extract text (Unicode and ASCII) from pdf files?
Back to top
View user's profile Send private message
marihanzo



Joined: 17 Mar 2009
Posts: 2

PostPosted: Tue Mar 17, 2009 4:23 pm    Post subject: Extract text from pdf Reply with quote

After a long search over the internet I couldn't find anything related to this problem.

So I decided to implement my own function to extract text from pdf files in C#.
Starting from a procedure found into samples of iText library, I coded this class that does what I need.

Infortunately I can't guarantee that text extracted is always the whole content of pdf file, but during my tests fortunately it was Very Happy .

I hope this will help, the class code follows.

PDFParser class
Code:

    public class PDFParser
    {
        /// BT = Beginning of a text object operator
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript

        #region Fields

        #region _numberOfCharsToKeep
        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;
        #endregion

        #endregion

     

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// This method processes an uncompressed Adobe (text) object
        /// and extracts text.
        /// </summary>
        /// <param name="input">uncompressed</param>
        /// <returns></returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                string resultString = "";

                // Flag showing if we are we currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';


                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
                            {
                                resultString += "\n\r";
                            }
                            else
                            {
                                if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
                                {
                                    resultString += "\n";
                                }
                                else
                                {
                                    if (CheckToken(new string[] { "Tj" }, previousCharacters))
                                    {
                                        resultString += " ";
                                    }
                                }
                            }
                        }

                        // End of a text object, also go to a new line.
                        if (bracketDepth == 0 &&
                            CheckToken(new string[] { "ET" }, previousCharacters))
                        {

                            inTextObject = false;
                            resultString += " ";
                        }
                        else
                        {
                            // Start outputting text
                            if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            else
                            {
                                // Stop outputting text
                                if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                else
                                {
                                    // Just a normal text character:
                                    if (bracketDepth == 1)
                                    {
                                        // Only print out next character no matter what.
                                        // Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        {
                                            nextLiteral = true;
                                        }
                                        else
                                        {
                                            if (((c >= ' ') && (c <= '~')) ||
                                                ((c >= 128) && (c < 255)))
                                            {
                                                resultString += c.ToString();
                                            }

                                            nextLiteral = false;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // Store the recent characters for
                    // when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    {
                        previousCharacters[j] = previousCharacters[j + 1];
                    }
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }
                return resultString;
            }
            catch
            {
                return "";
            }
        }
        #endregion

        #region CheckToken
        /// <summary>
        /// Check if a certain 2 character token just came along (e.g. BT)
        /// </summary>
        /// <param name="search">the searched token</param>
        /// <param name="recent">the recent character array</param>
        /// <returns></returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            foreach (string token in tokens)
            {
                if (token.Length > 1)
                {
                    if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
                        (recent[_numberOfCharsToKeep - 2] == token[1]) &&
                        ((recent[_numberOfCharsToKeep - 1] == ' ') ||
                        (recent[_numberOfCharsToKeep - 1] == 0x0d) ||
                        (recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
                        ((recent[_numberOfCharsToKeep - 4] == ' ') ||
                        (recent[_numberOfCharsToKeep - 4] == 0x0d) ||
                        (recent[_numberOfCharsToKeep - 4] == 0x0a))
                        )
                    {
                        return true;
                    }
                }
                else
                {
                    return false;
                }

            }
            return false;
        }
        #endregion
    }



Application code
Code:

       public override String ExtractText()
        {
            String outputText = "";
            try
            {
                PdfDocument inputDocument = PdfReader.Open(this._sDirectory + this._sFileName, PdfDocumentOpenMode.ReadOnly);

                foreach (PdfPage page in inputDocument.Pages)
                {
                    for (int index = 0; index < page.Contents.Elements.Count; index++)
                    {

                        PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;
                        outputText += new PDFParser().ExtractTextFromPDFBytes(stream.Value);
                    }
                }

            }
            catch (Exception e)
            {
                PDF_ParseException oEx = new PDF_ParseException(this, e);
                oEx.Log();
                oEx.ToPdf(this._sDirectoryException);
            }
            return outputText;
        }


Enjoy it! Cool
Back to top
View user's profile Send private message
Display posts from previous:   
This forum is locked: you cannot post, reply to, or edit topics.   This topic is locked: you cannot edit posts or make replies.    PDFsharp - moved to http://forum.pdfsharp.net/ Forum Index -> Support - moved to http://forum.pdfsharp.net/ All times are GMT
Page 1 of 1

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group. Hosted by phpBB.BizHat.com


Start Your Own YouTube Clone

Free Web Hosting | Free Forum Hosting | FlashWebHost.com | Image Hosting | Photo Gallery | FreeMarriage.com

Powered by PhpBBweb.com, setup your forum now!
For Support, visit Forums.BizHat.com