Extract text from a PDF (with Apache Tika) Tag(s): IO OpenSource


You can do it with Apache Tika
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

public class ParsePDFWithTika {
  public static void main(String args[]) throws Exception {

    InputStream is = null;
    try {
      is = new FileInputStream("C:/Temp/realhowto-vbs-20121221.pdf");
      ContentHandler contenthandler = new BodyContentHandler();
      Metadata metadata = new Metadata();
      PDFParser pdfparser = new PDFParser();
      pdfparser.parse(is, contenthandler, metadata, new ParseContext());
      System.out.println(contenthandler.toString());
    }
    catch (Exception e) {
      e.printStackTrace();
    }
    finally {
        if (is != null) is.close();
    }
  }
}
See also Extract text from HTML, Get Mime Type from a file.
blog comments powered by Disqus