/** * @Author Eric Jensen * Date: July, 2000 * Copyright (C) 2000 Eric Jensen * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * This is a wrapper for the Pj PDF parser... It is my first use of Pj * and I never had time to clean it up, so don't take it too seriously. :) * */ import java.io.*; import java.util.*; import com.etymon.pj.*; import com.etymon.pj.object.*; import com.etymon.pj.exception.*; /** * This is a wrapper for the Pj PDF parser */ public class PjWrapper { Pdf pdf; PjCatalog catalog; PjPagesNode rootPage; public PjWrapper(String filename) throws IOException, PjException { pdf = new Pdf(filename); // hopefully the catalog can never be a reference... catalog = (PjCatalog) pdf.getObject(pdf.getCatalog()); // root node of pages tree is specified by a reference // in the catalog rootPage = (PjPagesNode) pdf.resolve(catalog.getPages()); } public static void main (String [] args) throws IOException, PjException { PjWrapper testWrapper = new PjWrapper(args[0]); LinkedList textList = testWrapper.getAllText(); } /** * Returns as much text as we can extract from the PDF. * This currently includes: * * NOTE: Pj does not support LZW, so some text in some PDF's may not * be indexable */ public LinkedList getAllText() throws PjException { LinkedList stringList = new LinkedList(); Iterator streamIter = getAllContentsStreams().iterator(); PjStream stream; String streamData; String streamText; boolean moreData; int textStart, textEnd; System.out.println("Going through streams..."); while(streamIter.hasNext()) { System.out.println("Getting next stream"); stream = (PjStream) streamIter.next(); System.out.println("Adding text from stream with filter: " + getFilterString(stream)); stream = stream.flateDecompress(); System.out.println("Adding text from stream with filter after decompress: " + getFilterString(stream)); streamData = new String(stream.getBuffer()); streamText = new String(); moreData = true; textStart = textEnd = 0; while(moreData) { if ((textStart = streamData.indexOf('(', textEnd + 1)) < 0) { moreData = false; break; } if ((textEnd = streamData.indexOf(')', textStart + 1)) < 0) { moreData = false; break; } try { streamText += PjString.decodePdf(streamData.substring(textStart, textEnd + 1)); } catch (Exception e) { System.out.println("malformed string: " + streamData.substring(textStart, textEnd + 1)); } } System.out.println("Text from stream is: " + streamText); if (streamText.length() > 0) stringList.add(streamText); } return stringList; } public static String getFilterString(PjStream stream) throws PjException { String filterString = new String(); PjObject filter; System.out.println("getting filter from dictionary"); if ((filter = stream.getStreamDictionary().getFilter()) == null) { System.out.println("Got null filter"); return ""; } System.out.println("got it"); // filter should either be a name or an array of names if (filter instanceof PjName) { System.out.println("getting filter string from simple name"); filterString = ((PjName) filter).getString(); } else { System.out.println("getting filter string from array of names"); Iterator nameIter; Vector nameVector; if ((nameVector = ((PjArray) filter).getVector()) == null) { System.out.println("got null vector for list of names"); return ""; } nameIter = nameVector.iterator(); while (nameIter.hasNext()) { filterString += ((PjName) nameIter.next()).getString(); if (nameIter.hasNext()) filterString += " "; } } System.out.println("got filter string"); return filterString; } /** * Performs a post-order traversal of the pages tree * from the root node and gets all of the contents streams * @returns a list of all the contents of all the pages */ public LinkedList getAllContentsStreams() throws InvalidPdfObjectException { return getContentsStreams(getAllPages()); } /** * Get contents streams from the list of PjPage objects * @returns a list of all the contents of the pages */ public LinkedList getContentsStreams(LinkedList pages) throws InvalidPdfObjectException { LinkedList streams = new LinkedList(); Iterator pageIter = pages.iterator(); PjObject contents; while(pageIter.hasNext()) { contents = pdf.resolve(((PjPage)pageIter.next()).getContents()); // should only be a stream or an array of streams (or refs to streams) if (contents instanceof PjStream) streams.add(contents); else { Iterator streamsIter = ((PjArray)contents).getVector().iterator(); while(streamsIter.hasNext()) streams.add(pdf.resolve((PjObject)streamsIter.next())); } } return streams; } /** * Performs a post-order traversal of the pages tree * from the root node. * @returns a list of all the PjPage objects */ public LinkedList getAllPages() throws InvalidPdfObjectException { LinkedList pages = new LinkedList(); getPages(rootPage, pages); return pages; } /** * Performs a post-order traversal of the pages tree * from the node passed to it. * @returns a list of all the PjPage objects under node */ public void getPages(PjObject node, LinkedList pages) throws InvalidPdfObjectException { PjPagesNode pageNode = null; // let's hope pdf's don't have pointers to pointers if (node instanceof PjReference) pageNode = (PjPagesNode) pdf.resolve(node); else pageNode = (PjPagesNode) node; if (pageNode instanceof PjPage) { pages.add(pageNode); return; } // kids better be an array and not a reference to one Iterator kidIterator = ((PjArray) ((PjPages) pageNode).getKids()).getVector().iterator(); while(kidIterator.hasNext()) { getPages((PjObject) kidIterator.next(), pages); } } public Pdf getPdf() { return pdf; } }