public class PDFMarkedContent2XHTML
extends org.apache.pdfbox.text.PDFTextStripper
This was added in Tika 1.24 as an alpha version of a text extractor that builds the text from the marked text tree and includes/normalizes some of the structural tags.
Modifier and Type | Field and Description |
---|---|
static String |
XMP_DOCUMENT_CATALOG_LOCATION |
static String |
XMP_PAGE_LOCATION_PREFIX |
Modifier and Type | Method and Description |
---|---|
protected float |
computeFontHeight(org.apache.pdfbox.pdmodel.font.PDFont arg0) |
protected void |
endDocument(org.apache.pdfbox.pdmodel.PDDocument pdf) |
protected void |
endPage(org.apache.pdfbox.pdmodel.PDPage page) |
int |
getCurrentPageNo()
We need to override this because we are overriding
processPages(PDPageTree). |
int |
getStartPage() |
static void |
process(org.apache.pdfbox.pdmodel.PDDocument pdDocument,
ContentHandler handler,
ParseContext context,
Metadata metadata,
PDFParserConfig config)
Converts the given PDF document (and related metadata) to a stream
of XHTML SAX events sent to the given content handler.
|
void |
processPage(org.apache.pdfbox.pdmodel.PDPage page) |
protected void |
processPages(org.apache.pdfbox.pdmodel.PDPageTree pages)
See TIKA-2845 for why we need to override this.
|
void |
setEndBookmark(org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem pdOutlineItem) |
void |
setStartBookmark(org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem pdOutlineItem) |
void |
setStartPage(int startPage) |
protected void |
showGlyph(org.apache.pdfbox.util.Matrix textRenderingMatrix,
org.apache.pdfbox.pdmodel.font.PDFont font,
int code,
String unicode,
org.apache.pdfbox.util.Vector displacement) |
protected void |
startDocument(org.apache.pdfbox.pdmodel.PDDocument pdf) |
protected void |
startPage(org.apache.pdfbox.pdmodel.PDPage page) |
protected void |
writeCharacters(org.apache.pdfbox.text.TextPosition text) |
protected void |
writeLineSeparator() |
protected void |
writeParagraphEnd() |
protected void |
writeParagraphStart() |
protected void |
writeString(String text) |
protected void |
writeWordSeparator() |
endArticle, getAddMoreFormatting, getArticleEnd, getArticleStart, getAverageCharTolerance, getCharactersByArticle, getDropThreshold, getEndBookmark, getEndPage, getIndentThreshold, getLineSeparator, getListItemPatterns, getOutput, getPageEnd, getPageStart, getParagraphEnd, getParagraphStart, getSeparateByBeads, getSortByPosition, getSpacingTolerance, getStartBookmark, getSuppressDuplicateOverlappingText, getText, getWordSeparator, matchPattern, processTextPosition, setAddMoreFormatting, setArticleEnd, setArticleStart, setAverageCharTolerance, setDropThreshold, setEndPage, setIndentThreshold, setLineSeparator, setListItemPatterns, setPageEnd, setPageStart, setParagraphEnd, setParagraphStart, setShouldSeparateByBeads, setSortByPosition, setSpacingTolerance, setSuppressDuplicateOverlappingText, setWordSeparator, startArticle, startArticle, writePage, writePageEnd, writePageStart, writeParagraphSeparator, writeString, writeText
addOperator, applyTextAdjustment, beginMarkedContentSequence, beginText, decreaseLevel, endMarkedContentSequence, endText, getAppearance, getCurrentPage, getGraphicsStackSize, getGraphicsState, getInitialMatrix, getLevel, getResources, getTextLineMatrix, getTextMatrix, increaseLevel, operatorException, processAnnotation, processChildStream, processOperator, processOperator, processSoftMask, processTilingPattern, processTilingPattern, processTransparencyGroup, processType3Stream, registerOperatorProcessor, restoreGraphicsStack, restoreGraphicsState, saveGraphicsStack, saveGraphicsState, setLineDashPattern, setTextLineMatrix, setTextMatrix, showAnnotation, showFontGlyph, showFontGlyph, showForm, showGlyph, showText, showTextString, showTextStrings, showTransparencyGroup, showType3Glyph, showType3Glyph, transformedPoint, transformWidth, unsupportedOperator
public static final String XMP_DOCUMENT_CATALOG_LOCATION
public static final String XMP_PAGE_LOCATION_PREFIX
public static void process(org.apache.pdfbox.pdmodel.PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException
pdDocument - PDF document
handler - SAX content handler
metadata - PDF metadata
SAXException - if the content handler fails to process SAX events
TikaException - if there was an exception outside of per page processing
protected void processPages(org.apache.pdfbox.pdmodel.PDPageTree pages) throws IOException
IOException
public void processPage(org.apache.pdfbox.pdmodel.PDPage page) throws IOException
processPage
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void endPage(org.apache.pdfbox.pdmodel.PDPage page) throws IOException
IOException
protected void writeParagraphStart() throws IOException
writeParagraphStart
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void writeParagraphEnd() throws IOException
writeParagraphEnd
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void writeString(String text) throws IOException
writeString
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void writeCharacters(org.apache.pdfbox.text.TextPosition text) throws IOException
writeCharacters
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void writeWordSeparator() throws IOException
writeWordSeparator
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void writeLineSeparator() throws IOException
writeLineSeparator
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void startPage(org.apache.pdfbox.pdmodel.PDPage page) throws IOException
startPage
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void startDocument(org.apache.pdfbox.pdmodel.PDDocument pdf) throws IOException
startDocument
in class org.apache.pdfbox.text.PDFTextStripper
IOException
protected void endDocument(org.apache.pdfbox.pdmodel.PDDocument pdf) throws IOException
endDocument
in class org.apache.pdfbox.text.PDFTextStripper
IOException
public int getCurrentPageNo()
We need to override this because we are overriding processPages(PDPageTree).
getCurrentPageNo
in class org.apache.pdfbox.text.PDFTextStripper
public void setStartBookmark(org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem pdOutlineItem)
setStartBookmark
in class org.apache.pdfbox.text.PDFTextStripper
public void setEndBookmark(org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem pdOutlineItem)
setEndBookmark
in class org.apache.pdfbox.text.PDFTextStripper
public int getStartPage()
getStartPage
in class org.apache.pdfbox.text.PDFTextStripper
public void setStartPage(int startPage)
setStartPage
in class org.apache.pdfbox.text.PDFTextStripper
protected void showGlyph(org.apache.pdfbox.util.Matrix textRenderingMatrix, org.apache.pdfbox.pdmodel.font.PDFont font, int code, String unicode, org.apache.pdfbox.util.Vector displacement) throws IOException
IOException
protected float computeFontHeight(org.apache.pdfbox.pdmodel.font.PDFont arg0) throws IOException
IOException
Copyright © 2007–2023 The Apache Software Foundation. All rights reserved.