public class PDFMarkedContent2XHTML
extends org.apache.pdfbox.text.PDFTextStripper
This was added in Tika 1.24 as an alpha version of a text extractor that builds the text from the marked text tree and includes/normalizes some of the structural tags.
| Modifier and Type | Field and Description |
|---|---|
static String |
XMP_DOCUMENT_CATALOG_LOCATION |
static String |
XMP_PAGE_LOCATION_PREFIX |
| Modifier and Type | Method and Description |
|---|---|
protected float |
computeFontHeight(org.apache.pdfbox.pdmodel.font.PDFont arg0) |
protected void |
endDocument(org.apache.pdfbox.pdmodel.PDDocument pdf) |
protected void |
endPage(org.apache.pdfbox.pdmodel.PDPage page) |
int |
getCurrentPageNo()
We need to override this because we are overriding
processPages(PDPageTree) |
int |
getStartPage() |
static void |
process(org.apache.pdfbox.pdmodel.PDDocument pdDocument,
ContentHandler handler,
ParseContext context,
Metadata metadata,
PDFParserConfig config)
Converts the given PDF document (and related metadata) to a stream
of XHTML SAX events sent to the given content handler.
|
void |
processPage(org.apache.pdfbox.pdmodel.PDPage page) |
protected void |
processPages(org.apache.pdfbox.pdmodel.PDPageTree pages)
See TIKA-2845 for why we need to override this.
|
void |
setEndBookmark(org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem pdOutlineItem) |
void |
setStartBookmark(org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem pdOutlineItem) |
void |
setStartPage(int startPage) |
protected void |
showGlyph(org.apache.pdfbox.util.Matrix textRenderingMatrix,
org.apache.pdfbox.pdmodel.font.PDFont font,
int code,
String unicode,
org.apache.pdfbox.util.Vector displacement) |
protected void |
startDocument(org.apache.pdfbox.pdmodel.PDDocument pdf) |
protected void |
startPage(org.apache.pdfbox.pdmodel.PDPage page) |
protected void |
writeCharacters(org.apache.pdfbox.text.TextPosition text) |
protected void |
writeLineSeparator() |
protected void |
writeParagraphEnd() |
protected void |
writeParagraphStart() |
protected void |
writeString(String text) |
protected void |
writeWordSeparator() |
endArticle, getAddMoreFormatting, getArticleEnd, getArticleStart, getAverageCharTolerance, getCharactersByArticle, getDropThreshold, getEndBookmark, getEndPage, getIndentThreshold, getLineSeparator, getListItemPatterns, getOutput, getPageEnd, getPageStart, getParagraphEnd, getParagraphStart, getSeparateByBeads, getSortByPosition, getSpacingTolerance, getStartBookmark, getSuppressDuplicateOverlappingText, getText, getWordSeparator, matchPattern, processTextPosition, setAddMoreFormatting, setArticleEnd, setArticleStart, setAverageCharTolerance, setDropThreshold, setEndPage, setIndentThreshold, setLineSeparator, setListItemPatterns, setPageEnd, setPageStart, setParagraphEnd, setParagraphStart, setShouldSeparateByBeads, setSortByPosition, setSpacingTolerance, setSuppressDuplicateOverlappingText, setWordSeparator, startArticle, startArticle, writePage, writePageEnd, writePageStart, writeParagraphSeparator, writeString, writeText
addOperator, applyTextAdjustment, beginMarkedContentSequence, beginText, decreaseLevel, endMarkedContentSequence, endText, getAppearance, getCurrentPage, getGraphicsStackSize, getGraphicsState, getInitialMatrix, getLevel, getResources, getTextLineMatrix, getTextMatrix, increaseLevel, operatorException, processAnnotation, processChildStream, processOperator, processOperator, processSoftMask, processTilingPattern, processTilingPattern, processTransparencyGroup, processType3Stream, registerOperatorProcessor, restoreGraphicsStack, restoreGraphicsState, saveGraphicsStack, saveGraphicsState, setLineDashPattern, setTextLineMatrix, setTextMatrix, showAnnotation, showFontGlyph, showFontGlyph, showForm, showGlyph, showText, showTextString, showTextStrings, showTransparencyGroup, showType3Glyph, showType3Glyph, transformedPoint, transformWidth, unsupportedOperator
public static final String XMP_DOCUMENT_CATALOG_LOCATION
public static final String XMP_PAGE_LOCATION_PREFIX
public static void process(org.apache.pdfbox.pdmodel.PDDocument pdDocument,
ContentHandler handler,
ParseContext context,
Metadata metadata,
PDFParserConfig config)
throws SAXException,
TikaException
Parameters:
pdDocument - PDF document
handler - SAX content handler
metadata - PDF metadata
Throws:
SAXException - if the content handler fails to process SAX events
TikaException - if there was an exception outside of per page processing
protected void processPages(org.apache.pdfbox.pdmodel.PDPageTree pages)
throws IOException
Throws:
IOException
public void processPage(org.apache.pdfbox.pdmodel.PDPage page)
throws IOException
Overrides:
processPage in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void endPage(org.apache.pdfbox.pdmodel.PDPage page)
throws IOException
Throws:
IOException
protected void writeParagraphStart()
throws IOException
Overrides:
writeParagraphStart in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void writeParagraphEnd()
throws IOException
Overrides:
writeParagraphEnd in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void writeString(String text) throws IOException
Overrides:
writeString in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void writeCharacters(org.apache.pdfbox.text.TextPosition text)
throws IOException
Overrides:
writeCharacters in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void writeWordSeparator()
throws IOException
Overrides:
writeWordSeparator in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void writeLineSeparator()
throws IOException
Overrides:
writeLineSeparator in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void startPage(org.apache.pdfbox.pdmodel.PDPage page)
throws IOException
Overrides:
startPage in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void startDocument(org.apache.pdfbox.pdmodel.PDDocument pdf)
throws IOException
Overrides:
startDocument in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
protected void endDocument(org.apache.pdfbox.pdmodel.PDDocument pdf)
throws IOException
Overrides:
endDocument in class org.apache.pdfbox.text.PDFTextStripper
Throws:
IOException
public int getCurrentPageNo()
We need to override this because we are overriding processPages(PDPageTree).
Overrides:
getCurrentPageNo in class org.apache.pdfbox.text.PDFTextStripper
public void setStartBookmark(org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem pdOutlineItem)
Overrides:
setStartBookmark in class org.apache.pdfbox.text.PDFTextStripper
public void setEndBookmark(org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem pdOutlineItem)
Overrides:
setEndBookmark in class org.apache.pdfbox.text.PDFTextStripper
public int getStartPage()
Overrides:
getStartPage in class org.apache.pdfbox.text.PDFTextStripper
public void setStartPage(int startPage)
Overrides:
setStartPage in class org.apache.pdfbox.text.PDFTextStripper
protected void showGlyph(org.apache.pdfbox.util.Matrix textRenderingMatrix,
org.apache.pdfbox.pdmodel.font.PDFont font,
int code,
String unicode,
org.apache.pdfbox.util.Vector displacement)
throws IOException
Throws:
IOException
protected float computeFontHeight(org.apache.pdfbox.pdmodel.font.PDFont arg0)
throws IOException
Throws:
IOException

Copyright © 2007–2022 The Apache Software Foundation. All rights reserved.