public class PDFParserConfig extends Object implements Serializable
Modifier and Type | Class and Description |
---|---|
static class |
PDFParserConfig.OCR_STRATEGY |
Constructor and Description |
---|
PDFParserConfig() |
PDFParserConfig(InputStream is)
Loads properties from InputStream and then tries to close InputStream.
|
Modifier and Type | Method and Description |
---|---|
void |
configure(org.apache.tika.parser.pdf.PDF2XHTML pdf2XHTML)
Configures the given pdf2XHTML.
|
boolean |
equals(Object o) |
AccessChecker |
getAccessChecker() |
Float |
getAverageCharTolerance() |
boolean |
getCatchIntermediateIOExceptions()
|
boolean |
getDetectAngles() |
Float |
getDropThreshold() |
boolean |
getEnableAutoSpace() |
boolean |
getExtractAcroFormContent() |
boolean |
getExtractActions() |
boolean |
getExtractAnnotationText() |
boolean |
getExtractBookmarksText() |
boolean |
getExtractFontNames() |
boolean |
getExtractInlineImages() |
boolean |
getExtractMarkedContent() |
boolean |
getExtractUniqueInlineImagesOnly() |
boolean |
getIfXFAExtractOnlyXFA() |
long |
getMaxMainMemoryBytes()
The maximum amount of memory to use when loading a pdf into a PDDocument.
|
int |
getOcrDPI()
Dots per inch used to render the page image for OCR
|
String |
getOcrImageFormatName()
String representation of the image format used to render
the page image for OCR (examples: png, tiff, jpeg)
|
float |
getOcrImageQuality()
Image quality used to render the page image for OCR.
|
float |
getOcrImageScale()
Deprecated.
as of Tika 1.23, this is no longer used in rendering page images; use
setOcrDPI(int) |
org.apache.pdfbox.rendering.ImageType |
getOcrImageType()
Image type used to render the page image for OCR.
|
PDFParserConfig.OCR_STRATEGY |
getOcrStrategy() |
boolean |
getSetKCMS() |
boolean |
getSortByPosition() |
Float |
getSpacingTolerance() |
boolean |
getSuppressDuplicateOverlappingText() |
int |
hashCode() |
boolean |
isCatchIntermediateIOExceptions()
Deprecated.
|
void |
setAccessChecker(AccessChecker accessChecker) |
void |
setAverageCharTolerance(Float averageCharTolerance)
See
PDFTextStripper.setAverageCharTolerance(float) |
void |
setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions)
The PDFBox parser will throw an IOException if there is
a problem with a stream.
|
void |
setDetectAngles(boolean detectAngles) |
void |
setDropThreshold(float dropThreshold) |
void |
setEnableAutoSpace(boolean enableAutoSpace)
If true (the default), the parser should estimate
where spaces should be inserted between words.
|
void |
setExtractAcroFormContent(boolean extractAcroFormContent)
If true (the default), extract content from AcroForms
at the end of the document.
|
void |
setExtractActions(boolean v)
Whether or not to extract PDActions from the file.
|
void |
setExtractAnnotationText(boolean extractAnnotationText)
If true (the default), text in annotations will be
extracted.
|
void |
setExtractBookmarksText(boolean extractBookmarksText)
If true, extract bookmarks (document outline) text.
|
void |
setExtractFontNames(boolean extractFontNames)
Extract font names into a metadata field
|
void |
setExtractInlineImages(boolean extractInlineImages)
If true, extract inline embedded OBXImages.
|
void |
setExtractMarkedContent(boolean extractMarkedContent)
If the PDF contains marked content, try to extract text and its marked structure.
|
void |
setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly)
Multiple pages within a PDF file might refer to the same underlying image.
|
void |
setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA)
If false (the default), extract content from the full PDF
as well as the XFA form.
|
void |
setMaxMainMemoryBytes(int maxMainMemoryBytes)
Deprecated.
|
void |
setMaxMainMemoryBytes(long maxMainMemoryBytes) |
void |
setOcrDPI(int ocrDPI)
Dots per inch used to render the page image for OCR.
|
void |
setOcrImageFormatName(String ocrImageFormatName) |
void |
setOcrImageQuality(float ocrImageQuality)
Image quality used to render the page image for OCR.
|
void |
setOcrImageScale(float ocrImageScale)
Deprecated.
(as of Tika 1.23, this is no longer used in rendering page images)
|
void |
setOcrImageType(org.apache.pdfbox.rendering.ImageType ocrImageType)
Image type used to render the page image for OCR.
|
void |
setOcrImageType(String ocrImageTypeString)
Image type used to render the page image for OCR.
|
void |
setOcrStrategy(PDFParserConfig.OCR_STRATEGY ocrStrategy)
Which strategy to use for OCR
|
void |
setOcrStrategy(String ocrStrategyString)
Which strategy to use for OCR
|
void |
setSetKCMS(boolean setKCMS)
Whether to call
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"). |
void |
setSortByPosition(boolean sortByPosition)
If true, sort text tokens by their x/y position
before extracting text.
|
void |
setSpacingTolerance(Float spacingTolerance)
See
PDFTextStripper.setSpacingTolerance(float) |
void |
setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText)
If true, the parser should try to remove duplicated
text over the same region.
|
String |
toString() |
public PDFParserConfig()
public PDFParserConfig(InputStream is)
is
- public void setExtractMarkedContent(boolean extractMarkedContent)
extractMarkedContent
- public boolean getExtractMarkedContent()
public void configure(org.apache.tika.parser.pdf.PDF2XHTML pdf2XHTML)
pdf2XHTML
- public boolean getExtractAcroFormContent()
setExtractAcroFormContent(boolean)
public void setExtractAcroFormContent(boolean extractAcroFormContent)
extractAcroFormContent
- public boolean getIfXFAExtractOnlyXFA()
setIfXFAExtractOnlyXFA(boolean)
public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA)
ifXFAExtractOnlyXFA
- public boolean getExtractBookmarksText()
setExtractBookmarksText(boolean)
public void setExtractBookmarksText(boolean extractBookmarksText)
true
extractBookmarksText
- public void setExtractFontNames(boolean extractFontNames)
extractFontNames
- public boolean getExtractFontNames()
public boolean getExtractInlineImages()
setExtractInlineImages(boolean)
public void setExtractInlineImages(boolean extractInlineImages)
Set to true with caution.
The default is false.
extractInlineImages
- setExtractUniqueInlineImagesOnly(boolean)
public boolean getExtractUniqueInlineImagesOnly()
public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly)
Multiple pages within a PDF file might refer to the same underlying image.
If extractUniqueInlineImagesOnly is set to false, the
parser will call the EmbeddedExtractor each time the image appears on a page.
This might be desired for some use cases. However, to avoid duplication of
extracted images, set this to true. The default is true.
Note that uniqueness is determined only by the underlying PDF COSObject id, not by
file hash or similar equality metric.
If the PDF actually contains multiple copies of the same image
-- all with different object ids -- then all images will be extracted.
For this parameter to have any effect, extractInlineImages must be set to true.
Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting of this parameter, the extractor will only pull out one copy of each image per page. This parameter tries to capture uniqueness across the entire document.
extractUniqueInlineImagesOnly
- public boolean getEnableAutoSpace()
setEnableAutoSpace(boolean)
public void setEnableAutoSpace(boolean enableAutoSpace)
public boolean getSuppressDuplicateOverlappingText()
public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText)
public boolean getExtractAnnotationText()
setExtractAnnotationText(boolean)
public void setExtractAnnotationText(boolean extractAnnotationText)
public boolean getSortByPosition()
setSortByPosition(boolean)
public void setSortByPosition(boolean sortByPosition)
public Float getAverageCharTolerance()
setAverageCharTolerance(Float)
public void setAverageCharTolerance(Float averageCharTolerance)
PDFTextStripper.setAverageCharTolerance(float)
public Float getSpacingTolerance()
setSpacingTolerance(Float)
public void setSpacingTolerance(Float spacingTolerance)
PDFTextStripper.setSpacingTolerance(float)
public Float getDropThreshold()
public void setDropThreshold(float dropThreshold)
public AccessChecker getAccessChecker()
public void setAccessChecker(AccessChecker accessChecker)
public boolean isCatchIntermediateIOExceptions()
getCatchIntermediateIOExceptions()
public boolean getCatchIntermediateIOExceptions()
public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions)
The PDFBox parser will throw an IOException if there is a problem with a stream.
If this is set to true,
Tika's PDFParser will catch these exceptions and try to parse
the rest of the document. After the parse is completed,
Tika's PDFParser will throw the first caught exception.
catchIntermediateIOExceptions
- public void setOcrStrategy(PDFParserConfig.OCR_STRATEGY ocrStrategy)
ocrStrategy
- public void setOcrStrategy(String ocrStrategyString)
ocrStrategyString
- public PDFParserConfig.OCR_STRATEGY getOcrStrategy()
public String getOcrImageFormatName()
public void setOcrImageFormatName(String ocrImageFormatName)
ocrImageFormatName
- name of image format used to render
page image
getOcrImageFormatName()
public org.apache.pdfbox.rendering.ImageType getOcrImageType()
setOcrImageType(ImageType)
public void setOcrImageType(org.apache.pdfbox.rendering.ImageType ocrImageType)
ocrImageType
- public void setOcrImageType(String ocrImageTypeString)
setOcrImageType(ImageType)
public int getOcrDPI()
public void setOcrDPI(int ocrDPI)
ocrDPI
- public float getOcrImageQuality()
public void setOcrImageQuality(float ocrImageQuality)
public float getOcrImageScale()
setOcrDPI(int)
public void setOcrImageScale(float ocrImageScale)
ocrImageScale
- public void setExtractActions(boolean v)
v
- public boolean getExtractActions()
setExtractActions(boolean)
public long getMaxMainMemoryBytes()
@Deprecated public void setMaxMainMemoryBytes(int maxMainMemoryBytes)
setMaxMainMemoryBytes(long)
maxMainMemoryBytes
- public void setMaxMainMemoryBytes(long maxMainMemoryBytes)
public void setSetKCMS(boolean setKCMS)
Whether to call System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider").
KCMS is the unmaintained, legacy provider and is far faster than the newer replacement.
However, there are stability and security risks with using the unmaintained legacy provider.
Note, of course, that this is not thread safe. If the value is false
in your first thread, and the second thread changes this to true,
the system property in the first thread will now be true.
Default is false.
setKCMS
- whether or not to set KCMS
public boolean getSetKCMS()
public void setDetectAngles(boolean detectAngles)
public boolean getDetectAngles()
Copyright © 2007–2021 The Apache Software Foundation. All rights reserved.