Serialized Form
-
Package org.apache.tika
-
Class org.apache.tika.DeleteFetcherReply
class DeleteFetcherReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
memoizedIsInitialized
byte memoizedIsInitialized
-
success_
boolean success_
-
-
Class org.apache.tika.DeleteFetcherRequest
class DeleteFetcherRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
fetcherId_
Object fetcherId_
-
memoizedIsInitialized
byte memoizedIsInitialized
-
-
Class org.apache.tika.DeletePipesIteratorReply
class DeletePipesIteratorReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
memoizedIsInitialized
byte memoizedIsInitialized
-
message_
Object message_
-
-
Class org.apache.tika.DeletePipesIteratorRequest
class DeletePipesIteratorRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
iteratorId_
Object iteratorId_
-
memoizedIsInitialized
byte memoizedIsInitialized
-
-
Class org.apache.tika.FetchAndParseReply
class FetchAndParseReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Class org.apache.tika.FetchAndParseRequest
class FetchAndParseRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Class org.apache.tika.GetFetcherConfigJsonSchemaReply
class GetFetcherConfigJsonSchemaReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
fetcherConfigJsonSchema_
Object fetcherConfigJsonSchema_
-
memoizedIsInitialized
byte memoizedIsInitialized
-
-
Class org.apache.tika.GetFetcherConfigJsonSchemaRequest
class GetFetcherConfigJsonSchemaRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
fetcherClass_
Object fetcherClass_
-
memoizedIsInitialized
byte memoizedIsInitialized
-
-
Class org.apache.tika.GetFetcherReply
class GetFetcherReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Class org.apache.tika.GetFetcherRequest
class GetFetcherRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
fetcherId_
Object fetcherId_
-
memoizedIsInitialized
byte memoizedIsInitialized
-
-
Class org.apache.tika.GetPipesIteratorReply
class GetPipesIteratorReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Class org.apache.tika.GetPipesIteratorRequest
class GetPipesIteratorRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
iteratorId_
Object iteratorId_
-
memoizedIsInitialized
byte memoizedIsInitialized
-
-
Class org.apache.tika.ListFetchersReply
class ListFetchersReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
getFetcherReplies_
List<GetFetcherReply> getFetcherReplies_
-
memoizedIsInitialized
byte memoizedIsInitialized
-
-
Class org.apache.tika.ListFetchersRequest
class ListFetchersRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
memoizedIsInitialized
byte memoizedIsInitialized
-
numFetchersPerPage_
int numFetchersPerPage_
-
pageNumber_
int pageNumber_
-
-
Class org.apache.tika.SaveFetcherReply
class SaveFetcherReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
fetcherId_
Object fetcherId_
-
memoizedIsInitialized
byte memoizedIsInitialized
-
-
Class org.apache.tika.SaveFetcherRequest
class SaveFetcherRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Class org.apache.tika.SavePipesIteratorReply
class SavePipesIteratorReply extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
Serialized Fields
-
memoizedIsInitialized
byte memoizedIsInitialized
-
message_
Object message_
-
-
Class org.apache.tika.SavePipesIteratorRequest
class SavePipesIteratorRequest extends com.google.protobuf.GeneratedMessageV3 implements Serializable- serialVersionUID:
- 0L
-
-
Package org.apache.tika.client
-
Exception org.apache.tika.client.TikaClientException
class TikaClientException extends TikaException implements Serializable
-
-
Package org.apache.tika.config
-
Class org.apache.tika.config.EmbeddedLimits
class EmbeddedLimits extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
maxCount
int maxCount
-
maxDepth
int maxDepth
-
throwOnMaxCount
boolean throwOnMaxCount
-
throwOnMaxDepth
boolean throwOnMaxDepth
-
-
Class org.apache.tika.config.OutputLimits
class OutputLimits extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
maxPackageEntryDepth
int maxPackageEntryDepth
-
maxXmlDepth
int maxXmlDepth
-
throwOnWriteLimit
boolean throwOnWriteLimit
-
writeLimit
int writeLimit
-
zipBombRatio
long zipBombRatio
-
zipBombThreshold
long zipBombThreshold
-
-
Class org.apache.tika.config.TimeoutLimits
class TimeoutLimits extends Object implements Serializable- serialVersionUID:
- 2L
-
Serialized Fields
-
progressTimeoutMillis
long progressTimeoutMillis
-
totalTaskTimeoutMillis
long totalTaskTimeoutMillis
-
-
-
Package org.apache.tika.detect
-
Class org.apache.tika.detect.BOMDetector
class BOMDetector extends Object implements Serializable -
Class org.apache.tika.detect.CompositeDetector
class CompositeDetector extends Object implements Serializable- serialVersionUID:
- 5980683158436430252L
-
Serialized Fields
-
detectors
List<Detector> detectors
-
registry
MediaTypeRegistry registry
-
-
Class org.apache.tika.detect.CompositeEncodingDetector
class CompositeEncodingDetector extends Object implements Serializable- serialVersionUID:
- 5980683158436430252L
-
Serialized Fields
-
baseDetectors
List<EncodingDetector> baseDetectors
-
detectors
List<EncodingDetector> detectors
-
metaDetector
MetaEncodingDetector metaDetector
-
-
Class org.apache.tika.detect.DefaultDetector
class DefaultDetector extends CompositeDetector implements Serializable- serialVersionUID:
- -8170114575326908027L
-
Serialized Fields
-
excludedClasses
Collection<Class<? extends Detector>> excludedClasses
-
mimeTypes
MimeTypes mimeTypes
-
textDetector
TextDetector textDetector
-
-
Class org.apache.tika.detect.DefaultEncodingDetector
class DefaultEncodingDetector extends CompositeEncodingDetector implements Serializable -
Class org.apache.tika.detect.DefaultProbDetector
class DefaultProbDetector extends CompositeDetector implements Serializable- serialVersionUID:
- -8836240060532323352L
-
Class org.apache.tika.detect.EmptyDetector
class EmptyDetector extends Object implements Serializable -
Class org.apache.tika.detect.FileCommandDetector
class FileCommandDetector extends Object implements Serializable -
Class org.apache.tika.detect.MagicDetector
class MagicDetector extends Object implements Serializable-
Serialized Fields
-
isRegex
boolean isRegex
True if pattern is a regular expression, false otherwise. -
isStringIgnoreCase
boolean isStringIgnoreCase
True if we're doing a case-insensitive string match, false otherwise. -
length
int length
Length of the comparison window. -
mask
byte[] mask
Bit mask that is applied to the source bytes before pattern matching. -
offsetRangeBegin
int offsetRangeBegin
First offset (inclusive) of the comparison window within the document input stream. Greater than or equal to zero. -
offsetRangeEnd
int offsetRangeEnd
Last offset (inclusive) of the comparison window within the document input stream. Greater than or equal to the first offset. Note that this is not the offset of the last byte read from the document stream. Instead, the last window of bytes to be compared starts at this offset.
-
pattern
byte[] pattern
The magic match pattern. If this byte pattern is equal to the possibly bit-masked bytes from the input stream, then the type detection succeeds and the configured MagicDetector.type is returned. -
patternLength
int patternLength
Length of the pattern, which in the case of regular expressions will not be the same as the comparison window length. -
type
MediaType type
The matching media type. Returned by the #detect(InputStream, Metadata) method if a match is found.
-
-
-
Class org.apache.tika.detect.MatroskaDetector
class MatroskaDetector extends Object implements Serializable- serialVersionUID:
- 1L
-
Class org.apache.tika.detect.MetadataCharsetDetector
class MetadataCharsetDetector extends Object implements Serializable -
Class org.apache.tika.detect.NameDetector
class NameDetector extends Object implements Serializable -
Class org.apache.tika.detect.NNExampleModelDetector
class NNExampleModelDetector extends TrainedModelDetector implements Serializable- serialVersionUID:
- 1L
-
Class org.apache.tika.detect.OverrideDetector
class OverrideDetector extends Object implements Serializable -
Class org.apache.tika.detect.OverrideEncodingDetector
class OverrideEncodingDetector extends Object implements Serializable-
Serialized Fields
-
charset
Charset charset
-
-
-
Class org.apache.tika.detect.TextDetector
class TextDetector extends Object implements Serializable- serialVersionUID:
- 4774601079503507765L
-
Serialized Fields
-
bytesToTest
int bytesToTest
-
-
Class org.apache.tika.detect.TrainedModelDetector
class TrainedModelDetector extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
MODEL_MAP
Map<MediaType,
TrainedModel> MODEL_MAP
-
-
Class org.apache.tika.detect.TypeDetector
class TypeDetector extends Object implements Serializable -
Class org.apache.tika.detect.ZeroSizeFileDetector
class ZeroSizeFileDetector extends Object implements Serializable
-
-
Package org.apache.tika.detect.apple
-
Class org.apache.tika.detect.apple.BPListDetector
class BPListDetector extends Object implements Serializable -
Class org.apache.tika.detect.apple.IWorkDetector
class IWorkDetector extends Object implements Serializable
-
-
Package org.apache.tika.detect.gzip
-
Class org.apache.tika.detect.gzip.GZipSpecializationDetector
class GZipSpecializationDetector extends Object implements Serializable
-
-
Package org.apache.tika.detect.magika
-
Class org.apache.tika.detect.magika.MagikaDetector
class MagikaDetector extends Object implements Serializable-
Serialized Fields
-
defaultConfig
MagikaDetector.Config defaultConfig
-
hasMagika
Boolean hasMagika
-
-
-
-
Package org.apache.tika.detect.microsoft
-
Class org.apache.tika.detect.microsoft.POIFSContainerDetector
class POIFSContainerDetector extends Object implements Serializable- serialVersionUID:
- -3028021741663605293L
-
-
Package org.apache.tika.detect.microsoft.ooxml
-
Class org.apache.tika.detect.microsoft.ooxml.OPCPackageDetector
class OPCPackageDetector extends Object implements Serializable
-
-
Package org.apache.tika.detect.ogg
-
Class org.apache.tika.detect.ogg.OggDetector
class OggDetector extends Object implements Serializable- serialVersionUID:
- 591382028699008553L
-
-
Package org.apache.tika.detect.ole
-
Class org.apache.tika.detect.ole.MiscOLEDetector
class MiscOLEDetector extends Object implements Serializable
-
-
Package org.apache.tika.detect.siegfried
-
Class org.apache.tika.detect.siegfried.SiegfriedDetector
class SiegfriedDetector extends Object implements Serializable-
Serialized Fields
-
defaultConfig
SiegfriedDetector.Config defaultConfig
-
hasSiegfriedCommand
Boolean hasSiegfriedCommand
-
-
-
-
Package org.apache.tika.detect.zip
-
Class org.apache.tika.detect.zip.DefaultZipContainerDetector
class DefaultZipContainerDetector extends Object implements Serializable- serialVersionUID:
- 2891763938430295453L
-
Serialized Fields
-
staticZipDetectors
List<ZipContainerDetector> staticZipDetectors
-
-
Class org.apache.tika.detect.zip.DeprecatedStreamingZipContainerDetector
class DeprecatedStreamingZipContainerDetector extends org.apache.tika.detect.zip.ZipContainerDetectorBase implements Serializable -
Class org.apache.tika.detect.zip.FrictionlessPackageDetector
class FrictionlessPackageDetector extends Object implements Serializable -
Class org.apache.tika.detect.zip.IPADetector
class IPADetector extends Object implements Serializable -
Class org.apache.tika.detect.zip.JarDetector
class JarDetector extends Object implements Serializable -
Class org.apache.tika.detect.zip.KMZDetector
class KMZDetector extends Object implements Serializable -
Class org.apache.tika.detect.zip.OpenDocumentDetector
class OpenDocumentDetector extends Object implements Serializable -
Class org.apache.tika.detect.zip.StarOfficeDetector
class StarOfficeDetector extends Object implements Serializable -
Class org.apache.tika.detect.zip.StreamingZipContainerDetector
class StreamingZipContainerDetector extends DefaultZipContainerDetector implements Serializable- serialVersionUID:
- 2891763938430295453L
-
-
Package org.apache.tika.embedder
-
Class org.apache.tika.embedder.ExternalEmbedder
class ExternalEmbedder extends Object implements Serializable- serialVersionUID:
- -2828829275642475697L
-
Serialized Fields
-
command
String[] command
The external command to invoke.- See Also:
-
commandAppendOperator
String commandAppendOperator
-
commandAssignmentDelimeter
String commandAssignmentDelimeter
-
commandAssignmentOperator
String commandAssignmentOperator
-
metadataCommandArguments
Map<Property,
String[]> metadataCommandArguments Mapping of Tika metadata to command line parameters. -
quoteAssignmentValues
boolean quoteAssignmentValues
-
supportedEmbedTypes
Set<MediaType> supportedEmbedTypes
Media types supported by the external program. -
tmp
TemporaryResources tmp
-
-
-
Package org.apache.tika.eval.app.io
-
Exception org.apache.tika.eval.app.io.ExtractReaderException
class ExtractReaderException extends IOException implements Serializable-
Serialized Fields
-
type
ExtractReaderException.TYPE type
-
-
-
-
Package org.apache.tika.eval.core.metadata
-
Class org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter
class TikaEvalMetadataFilter extends MetadataFilterBase implements Serializable
-
-
Package org.apache.tika.example
-
Class org.apache.tika.example.DirListParser
class DirListParser extends Object implements Serializable- serialVersionUID:
- 2717930544410610735L
-
Class org.apache.tika.example.EncryptedPrescriptionDetector
class EncryptedPrescriptionDetector extends Object implements Serializable- serialVersionUID:
- -1709652690773421147L
-
Class org.apache.tika.example.EncryptedPrescriptionParser
class EncryptedPrescriptionParser extends Object implements Serializable- serialVersionUID:
- -7816987249611278541L
-
Class org.apache.tika.example.LanguageDetectingParser
class LanguageDetectingParser extends DelegatingParser implements Serializable- serialVersionUID:
- 4291320409396502774L
-
Class org.apache.tika.example.PickBestTextEncodingParser
class PickBestTextEncodingParser extends AbstractMultipleParser implements Serializable- serialVersionUID:
- 730345169223211807L
-
Serialized Fields
-
charsetsToTry
String[] charsetsToTry
Deprecated. Which charsets we should try
-
-
Class org.apache.tika.example.PickBestTextEncodingParser.CharsetContentHandlerFactory
class CharsetContentHandlerFactory extends Object implements Serializable-
Serialized Fields
-
handler
ContentHandler handler
-
index
int index
-
-
-
Class org.apache.tika.example.PrescriptionParser
class PrescriptionParser extends XMLParser implements Serializable- serialVersionUID:
- 7690682277511967388L
-
-
Package org.apache.tika.exception
-
Exception org.apache.tika.exception.AccessPermissionException
class AccessPermissionException extends TikaException implements Serializable -
Exception org.apache.tika.exception.CorruptedFileException
class CorruptedFileException extends TikaException implements Serializable -
Exception org.apache.tika.exception.EmbeddedLimitReachedException
class EmbeddedLimitReachedException extends RuntimeException implements Serializable-
Serialized Fields
-
limit
int limit
-
limitType
EmbeddedLimitReachedException.LimitType limitType
-
-
-
Exception org.apache.tika.exception.EncryptedDocumentException
class EncryptedDocumentException extends TikaException implements Serializable -
Exception org.apache.tika.exception.FileTooLongException
class FileTooLongException extends IOException implements Serializable -
Exception org.apache.tika.exception.RuntimeSAXException
class RuntimeSAXException extends RuntimeException implements Serializable -
Exception org.apache.tika.exception.TikaConfigException
class TikaConfigException extends TikaException implements Serializable -
Exception org.apache.tika.exception.TikaException
class TikaException extends Exception implements Serializable -
Exception org.apache.tika.exception.TikaMemoryLimitException
class TikaMemoryLimitException extends TikaException implements Serializable -
Exception org.apache.tika.exception.TikaTimeoutException
class TikaTimeoutException extends RuntimeException implements Serializable -
Exception org.apache.tika.exception.UnsupportedFormatException
class UnsupportedFormatException extends TikaException implements Serializable -
Exception org.apache.tika.exception.WriteLimitReachedException
class WriteLimitReachedException extends SAXException implements Serializable-
Serialized Fields
-
writeLimit
int writeLimit
-
-
-
Exception org.apache.tika.exception.ZeroByteFileException
class ZeroByteFileException extends TikaException implements Serializable
-
-
Package org.apache.tika.extractor
-
Class org.apache.tika.extractor.EmbeddedDocumentUtil
class EmbeddedDocumentUtil extends Object implements Serializable-
Serialized Fields
-
context
ParseContext context
-
detector
Detector detector
-
embeddedDocumentExtractor
EmbeddedDocumentExtractor embeddedDocumentExtractor
-
mimeTypes
MimeTypes mimeTypes
-
-
-
Class org.apache.tika.extractor.ParserContainerExtractor
class ParserContainerExtractor extends Object implements Serializable- serialVersionUID:
- 2261131045580861514L
-
Class org.apache.tika.extractor.StandardExtractorFactory
class StandardExtractorFactory extends Object implements Serializable
-
-
Package org.apache.tika.gui
-
Class org.apache.tika.gui.TikaGUI
class TikaGUI extends JFrame implements Serializable- serialVersionUID:
- 5883906936187059495L
-
Serialized Fields
-
cards
JPanel cards
Container for the editor cards. -
chooser
JFileChooser chooser
File chooser. -
context
ParseContext context
Parsing context. -
html
JEditorPane html
Formatted XHTML output. -
imageParser
org.apache.tika.gui.TikaGUI.ImageSavingParser imageParser
Captures requested embedded images -
json
JEditorPane json
Raw JSON source. -
layout
CardLayout layout
The card layout for switching between different views. -
metadata
JEditorPane metadata
Document metadata. -
parser
Parser parser
Configured parser instance. -
text
JEditorPane text
Plain text output. -
textMain
JEditorPane textMain
Main content output. -
tikaConfig
TikaLoader tikaConfig
-
xml
JEditorPane xml
Raw XHTML source.
-
-
-
Package org.apache.tika.inference
-
Class org.apache.tika.inference.AbstractEmbeddingFilter
class AbstractEmbeddingFilter extends MetadataFilter implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
defaultConfig
InferenceConfig defaultConfig
-
-
Class org.apache.tika.inference.ImageEmbeddingConfig
class ImageEmbeddingConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
apiKey
String apiKey
-
baseUrl
String baseUrl
-
maxFileSizeToEmbed
long maxFileSizeToEmbed
-
minFileSizeToEmbed
long minFileSizeToEmbed
-
model
String model
-
outputField
String outputField
Metadata field to store the serialized chunk JSON containing the image vector and locators. Defaults to the canonical chunks field so image embeddings merge with text chunks in a single array. -
skipEmbedding
boolean skipEmbedding
-
timeoutSeconds
int timeoutSeconds
-
-
Class org.apache.tika.inference.ImageEmbeddingConfig.RuntimeConfig
class RuntimeConfig extends ImageEmbeddingConfig implements Serializable -
Class org.apache.tika.inference.InferenceConfig
class InferenceConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
apiKey
String apiKey
Optional API key. Empty means no auth. -
baseUrl
String baseUrl
Base URL of the embeddings API (no trailing slash). -
clearContentAfterChunking
boolean clearContentAfterChunking
If true, the content field (default tika:content) is removed from metadata after chunking and embedding. This avoids storing the full text twice (once as raw content, once inside the chunks). Default is false. -
contentField
String contentField
The metadata field to read the source text from. Defaults to tika:content. -
maxBatchSize
int maxBatchSize
Maximum number of chunk texts to send in a single embeddings API request. If a document produces more chunks than this, the filter splits them into multiple HTTP calls. OpenAI's embeddings endpoint caps at 2048 inputs per request; the default here (256) is a safe value that works across most providers while keeping request sizes reasonable.
-
maxChunkChars
int maxChunkChars
Maximum number of characters per chunk. The chunker will try to break at markdown heading or paragraph boundaries before hitting this limit. -
maxChunks
int maxChunks
Maximum number of chunks to produce per document. If a document's text generates more chunks than this, excess chunks are silently dropped. This prevents pathologically large documents from triggering an unbounded number of embedding API calls. Default is 1024. Set to
-1 for no limit. -
model
String model
Model identifier sent in the embeddings request. -
outputField
String outputField
The metadata field where the JSON chunk array is written. -
overlapChars
int overlapChars
Number of characters of overlap between consecutive chunks. Helps ensure no context is lost at chunk boundaries. -
skipEmbedding
boolean skipEmbedding
If true, the embedding filter is skipped entirely for this request. Useful when the filter is configured as the default filter but should be bypassed for specific documents (e.g. binary blobs, very short metadata-only records). Set via ParseContext JSON: {"openai-embedding-filter": {"skipEmbedding": true}}. Default is false. -
timeoutSeconds
int timeoutSeconds
HTTP read timeout in seconds.
-
-
Class org.apache.tika.inference.InferenceConfig.RuntimeConfig
class RuntimeConfig extends InferenceConfig implements Serializable -
Class org.apache.tika.inference.OpenAIEmbeddingFilter
class OpenAIEmbeddingFilter extends AbstractEmbeddingFilter implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
apiKeyHeaderName
String apiKeyHeaderName
HTTP header name for API key auth. Default: Authorization. -
apiKeyPrefix
String apiKeyPrefix
Prefix before API key value. Default: "Bearer ". -
embeddingsPath
String embeddingsPath
URL path appended to baseUrl for embeddings requests. Default: /v1/embeddings. For Azure OpenAI, set to /openai/deployments/{deployment}/embeddings?api-version=2024-02-01.
-
-
Class org.apache.tika.inference.OpenAIImageEmbeddingParser
class OpenAIImageEmbeddingParser extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
apiKeyHeaderName
String apiKeyHeaderName
HTTP header name for API key auth. Default: Authorization. -
apiKeyPrefix
String apiKeyPrefix
Prefix before API key value. Default: "Bearer ". -
defaultConfig
ImageEmbeddingConfig defaultConfig
-
embeddingsPath
String embeddingsPath
URL path for embeddings requests. Default: /v1/embeddings.
-
-
-
Package org.apache.tika.io
-
Exception org.apache.tika.io.EndianUtils.BufferUnderrunException
class BufferUnderrunException extends TikaException implements Serializable- serialVersionUID:
- 8358288231138076276L
-
-
Package org.apache.tika.langdetect.charsoup
-
Class org.apache.tika.langdetect.charsoup.CharSoupMetadataFilter
class CharSoupMetadataFilter extends MetadataFilterBase implements Serializable-
Serialized Fields
-
maxLength
int maxLength
-
-
-
-
Package org.apache.tika.langdetect.opennlp.metadatafilter
-
Class org.apache.tika.langdetect.opennlp.metadatafilter.OpenNLPMetadataFilter
class OpenNLPMetadataFilter extends MetadataFilterBase implements Serializable-
Serialized Fields
-
maxCharsForDetection
int maxCharsForDetection
-
-
-
-
Package org.apache.tika.langdetect.optimaize.metadatafilter
-
Class org.apache.tika.langdetect.optimaize.metadatafilter.OptimaizeMetadataFilter
class OptimaizeMetadataFilter extends MetadataFilterBase implements Serializable-
Serialized Fields
-
maxCharsForDetection
int maxCharsForDetection
-
-
-
-
Package org.apache.tika.metadata
-
Class org.apache.tika.metadata.Metadata
class Metadata extends Object implements Serializable- serialVersionUID:
- 5623926545693153182L
-
Serialized Fields
-
metadata
Map<String,
String[]> metadata A map of all metadata attributes. -
writeLimiter
MetadataWriteLimiter writeLimiter
-
-
Exception org.apache.tika.metadata.PropertyTypeException
class PropertyTypeException extends IllegalArgumentException implements Serializable
-
-
Package org.apache.tika.metadata.filter
-
Class org.apache.tika.metadata.filter.CaptureGroupMetadataFilter
class CaptureGroupMetadataFilter extends MetadataFilterBase implements Serializable -
Class org.apache.tika.metadata.filter.ClearByAttachmentTypeMetadataFilter
class ClearByAttachmentTypeMetadataFilter extends MetadataFilterBase implements Serializable -
Class org.apache.tika.metadata.filter.CompositeMetadataFilter
class CompositeMetadataFilter extends MetadataFilter implements Serializable-
Serialized Fields
-
filters
List<MetadataFilter> filters
-
-
-
Class org.apache.tika.metadata.filter.DateNormalizingMetadataFilter
class DateNormalizingMetadataFilter extends MetadataFilterBase implements Serializable-
Serialized Fields
-
defaultTimeZone
TimeZone defaultTimeZone
-
-
-
Class org.apache.tika.metadata.filter.DefaultMetadataFilter
class DefaultMetadataFilter extends CompositeMetadataFilter implements Serializable -
Class org.apache.tika.metadata.filter.ExcludeFieldMetadataFilter
class ExcludeFieldMetadataFilter extends MetadataFilterBase implements Serializable -
Class org.apache.tika.metadata.filter.FieldNameMappingFilter
class FieldNameMappingFilter extends MetadataFilterBase implements Serializable -
Class org.apache.tika.metadata.filter.GeoPointMetadataFilter
class GeoPointMetadataFilter extends MetadataFilterBase implements Serializable-
Serialized Fields
-
geoPointFieldName
String geoPointFieldName
-
-
-
Class org.apache.tika.metadata.filter.IncludeFieldMetadataFilter
class IncludeFieldMetadataFilter extends MetadataFilterBase implements Serializable -
Class org.apache.tika.metadata.filter.MetadataFilter
class MetadataFilter extends Object implements Serializable -
Class org.apache.tika.metadata.filter.MetadataFilterBase
class MetadataFilterBase extends MetadataFilter implements Serializable -
Class org.apache.tika.metadata.filter.NoOpFilter
class NoOpFilter extends MetadataFilter implements Serializable -
Class org.apache.tika.metadata.filter.RemoveByMimeMetadataFilter
class RemoveByMimeMetadataFilter extends MetadataFilter implements Serializable
-
-
Package org.apache.tika.metadata.writefilter
-
Class org.apache.tika.metadata.writefilter.StandardMetadataLimiter
class StandardMetadataLimiter extends Object implements Serializable-
Serialized Fields
-
estimatedSize
int estimatedSize
-
excludeFields
Set<String> excludeFields
-
fieldSizes
Map<String,
Integer> fieldSizes -
includeEmpty
boolean includeEmpty
-
includeFields
Set<String> includeFields
-
maxFieldSize
int maxFieldSize
-
maxKeySize
int maxKeySize
-
maxTotalEstimatedSize
int maxTotalEstimatedSize
-
maxValuesPerField
int maxValuesPerField
-
minimumMaxFieldSizeInAlwaysFields
int minimumMaxFieldSizeInAlwaysFields
-
-
-
-
Package org.apache.tika.mime
-
Class org.apache.tika.mime.MediaType
class MediaType extends Object implements Serializable- serialVersionUID:
- -3831000556189036392L
-
Serialized Fields
-
parameters
Map<String,
String> parameters Immutable sorted map of media type parameters. -
semicolon
int semicolon
Location of the first ";" character separating the type part of MediaType.string from possible parameters. Length of MediaType.string in case there are no parameters. -
slash
int slash
Location of the "/" character separating the type and the subtype tokens in MediaType.string. -
string
String string
Canonical string representation of this media type.
-
-
Class org.apache.tika.mime.MediaTypeRegistry
class MediaTypeRegistry extends Object implements Serializable- serialVersionUID:
- 4710974869988895410L
-
Serialized Fields
-
inheritance
Map<MediaType,
MediaType> inheritance Known type inheritance relationships. The mapping is from a media type to the closest supertype. -
registry
Map<MediaType,
MediaType> registry Registry of known media types, including type aliases. A canonical media type is handled as an identity mapping, while an alias is stored as a mapping from the alias to the corresponding canonical type.
-
-
Class org.apache.tika.mime.MimeType
class MimeType extends Object implements Serializable- serialVersionUID:
- 4357830439860729201L
-
Serialized Fields
-
acronym
String acronym
The MimeType acronym -
description
String description
Description of this media type. -
extensions
List<String> extensions
All known file extensions of this type, in order of preference (best first). -
isInterpreted
boolean isInterpreted
Whether this mime-type is used for server-side scripts, and thus cannot reliably be used for filename-based type detection -
links
List<URI> links
Documentation Links -
magics
List<org.apache.tika.mime.Magic> magics
The magics associated to this Mime-Type -
minLength
int minLength
The minimum length of data to provide for magic analysis -
rootXML
List<org.apache.tika.mime.MimeType.RootXML> rootXML
The root-XML associated to this Mime-Type -
type
MediaType type
The normalized media type name. -
uti
String uti
The Uniform Type Identifier (UTI); see http://en.wikipedia.org/wiki/Uniform_Type_Identifier
-
-
Exception org.apache.tika.mime.MimeTypeException
class MimeTypeException extends TikaException implements Serializable -
Class org.apache.tika.mime.MimeTypes
class MimeTypes extends Object implements Serializable- serialVersionUID:
- -1350863170146349036L
-
Serialized Fields
-
htmlMimeType
MimeType htmlMimeType
html type, text/html -
magics
List<org.apache.tika.mime.Magic> magics
Sorted list of all registered magics -
patterns
org.apache.tika.mime.Patterns patterns
The patterns matcher -
registry
MediaTypeRegistry registry
Registered media types and their aliases. -
rootMimeType
MimeType rootMimeType
Root type, application/octet-stream. -
rootMimeTypeL
List<MimeType> rootMimeTypeL
-
textMimeType
MimeType textMimeType
Text type, text/plain. -
types
Map<MediaType,
MimeType> types All the registered MimeTypes indexed on their canonical names -
xmlMimeType
MimeType xmlMimeType
xml type, application/xml -
xmls
List<MimeType> xmls
Sorted list of all registered rootXML
-
-
Class org.apache.tika.mime.ProbabilisticMimeDetectionSelector
class ProbabilisticMimeDetectionSelector extends Object implements Serializable- serialVersionUID:
- 224589862960269260L
-
Serialized Fields
-
changeRate
float changeRate
-
extension_neg
float extension_neg
-
extension_trust
float extension_trust
-
magic_neg
float magic_neg
-
magic_trust
float magic_trust
-
meta_neg
float meta_neg
-
meta_trust
float meta_trust
-
mimeTypes
MimeTypes mimeTypes
-
priorExtensionFileType
float priorExtensionFileType
-
priorMagicFileType
float priorMagicFileType
-
priorMetaFileType
float priorMetaFileType
-
rootMediaType
MediaType rootMediaType
-
threshold
float threshold
-
-
-
Package org.apache.tika.ml.chardetect
-
Class org.apache.tika.ml.chardetect.MojibusterEncodingDetector
class MojibusterEncodingDetector extends Object implements Serializable-
Serialized Fields
-
nb
NaiveBayesBigramEncodingDetector nb
-
utf16
Utf16SpecialistEncodingDetector utf16
-
-
-
Class org.apache.tika.ml.chardetect.NaiveBayesBigramEncodingDetector
class NaiveBayesBigramEncodingDetector extends Object implements Serializable-
Serialized Fields
-
charsets
Charset[] charsets
Charset objects cached at load — one Charset.forName per class, ever. -
idf8
byte[] idf8
Global per-bigram IDF = log((C+1)/(df_i+1)) baked in at training, quantized to int8 at load via idfScale = maxAbs(idf)/127. IDF is non-negative so int8 values land in [0, 127]. Zero means "bigram appears in every class, no signal" and is the hot-loop skip condition. -
labels
String[] labels
-
logP8
byte[] logP8
Bigram-major int8 logP layout. Quantized at load time via per-class scale scale[c] = maxAbs(class c's logP column) / 127. In-memory footprint: 65_536 × numClasses bytes ≈ 2 MB for 32 classes, 4× smaller than float32. The hot-loop accumulates raw int8 products and applies dequantization once at the end of the probe, CharSoup-style. -
numClasses
int numClasses
-
perClassDequant
double[] perClassDequant
Per-class dequantization constant folded from scale[c] * idfScale / logVocabSize[c]. Applied once per class at the end of the probe to convert int accumulator to the final log-score. Keeping log V(c) in the dequant constant preserves the B-3 per-class score normalization from the float-path at zero additional cost.
-
-
-
Class org.apache.tika.ml.chardetect.Utf16SpecialistEncodingDetector
class Utf16SpecialistEncodingDetector extends Object implements Serializable-
Serialized Fields
-
extractor
Utf16ColumnFeatureExtractor extractor
-
maxProbeBytes
int maxProbeBytes
-
model
LinearModel model
-
-
-
-
Package org.apache.tika.ml.junkdetect
-
Class org.apache.tika.ml.junkdetect.JunkFilterEncodingDetector
class JunkFilterEncodingDetector extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
qualityDetector
TextQualityDetector qualityDetector
Cached quality detector. null if none is on the classpath. -
readLimit
int readLimit
-
-
-
Package org.apache.tika.parser
-
Class org.apache.tika.parser.AbstractEncodingDetectorParser
class AbstractEncodingDetectorParser extends Object implements Serializable-
Serialized Fields
-
encodingDetector
EncodingDetector encodingDetector
-
-
-
Class org.apache.tika.parser.AbstractExternalProcessParser
class AbstractExternalProcessParser extends Object implements Serializable- serialVersionUID:
- 7186985395903074255L
-
Class org.apache.tika.parser.AbstractParser
class AbstractParser extends Object implements Serializable- serialVersionUID:
- 7186985395903074255L
-
Class org.apache.tika.parser.AutoDetectParser
class AutoDetectParser extends CompositeParser implements Serializable- serialVersionUID:
- 6110455808615143122L
-
Serialized Fields
-
autoDetectParserConfig
AutoDetectParserConfig autoDetectParserConfig
Configuration used when initializing a SecureContentHandler and the TikaInputStream. -
detector
Detector detector
The type detector used by this parser to auto-detect the type of a document.
-
-
Class org.apache.tika.parser.AutoDetectParserConfig
class AutoDetectParserConfig extends Object implements Serializable-
Serialized Fields
-
contentHandlerDecoratorFactory
ContentHandlerDecoratorFactory contentHandlerDecoratorFactory
-
throwOnZeroBytes
boolean throwOnZeroBytes
-
-
-
Class org.apache.tika.parser.CompositeParser
class CompositeParser extends Object implements Serializable- serialVersionUID:
- 2192845797749627824L
-
Serialized Fields
-
fallback
Parser fallback
The fallback parser, used when no better parser is available. -
parsers
List<Parser> parsers
List of component parsers. -
registry
MediaTypeRegistry registry
Media type registry.
-
-
Class org.apache.tika.parser.CryptoParser
class CryptoParser extends DelegatingParser implements Serializable- serialVersionUID:
- -3507995752666557731L
-
Class org.apache.tika.parser.DefaultParser
class DefaultParser extends CompositeParser implements Serializable- serialVersionUID:
- 3612324825403757520L
-
Serialized Fields
-
excludedClasses
Collection<Class<? extends Parser>> excludedClasses
-
-
Class org.apache.tika.parser.DelegatingParser
class DelegatingParser extends Object implements Serializable -
Class org.apache.tika.parser.EmptyParser
class EmptyParser extends Object implements Serializable- serialVersionUID:
- -4218649699095732123L
-
Class org.apache.tika.parser.ErrorParser
class ErrorParser extends Object implements Serializable- serialVersionUID:
- 7727423956957641824L
-
Class org.apache.tika.parser.NetworkParser
class NetworkParser extends Object implements Serializable -
Class org.apache.tika.parser.ParseContext
class ParseContext extends Object implements Serializable- serialVersionUID:
- -5921436862145826534L
-
Serialized Fields
-
context
Map<String,
Object> context Map of typed objects in this context, keyed by class name. -
jsonConfigs
Map<String,
JsonConfig> jsonConfigs Map of JSON configs, keyed by component name (e.g., "pdf-parser"). This is the source of truth for round-trip serialization. Using JsonConfig interface allows for future extension with metadata.
-
-
Class org.apache.tika.parser.ParserDecorator
class ParserDecorator extends Object implements Serializable- serialVersionUID:
- -3861669115439125268L
-
Serialized Fields
-
parser
Parser parser
The decorated parser instance.
-
-
Class org.apache.tika.parser.ParserDecorator.MimeFilteringDecorator
class MimeFilteringDecorator extends ParserDecorator implements Serializable- serialVersionUID:
- 1L
-
Class org.apache.tika.parser.ParserPostProcessor
class ParserPostProcessor extends ParserDecorator implements Serializable -
Class org.apache.tika.parser.RecursiveParserWrapper
class RecursiveParserWrapper extends ParserDecorator implements Serializable- serialVersionUID:
- 9086536568120690938L
-
Serialized Fields
-
catchEmbeddedExceptions
boolean catchEmbeddedExceptions
-
inlineContent
boolean inlineContent
-
-
Class org.apache.tika.parser.RegexCaptureParser
class RegexCaptureParser extends Object implements Serializable -
Class org.apache.tika.parser.RegexCaptureParserConfig
class RegexCaptureParserConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Class org.apache.tika.parser.SimplePasswordProvider
class SimplePasswordProvider extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
password
String password
-
-
Class org.apache.tika.parser.StatefulParser
class StatefulParser extends ParserDecorator implements Serializable
-
-
Package org.apache.tika.parser.apple
-
Class org.apache.tika.parser.apple.AppleSingleFileParser
class AppleSingleFileParser extends Object implements Serializable -
Class org.apache.tika.parser.apple.PListParser
class PListParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.asm
-
Class org.apache.tika.parser.asm.ClassParser
class ClassParser extends Object implements Serializable- serialVersionUID:
- -3531388963354454357L
-
-
Package org.apache.tika.parser.audio
-
Class org.apache.tika.parser.audio.AudioParser
class AudioParser extends Object implements Serializable- serialVersionUID:
- -6015684081240882695L
-
Class org.apache.tika.parser.audio.MidiParser
class MidiParser extends Object implements Serializable- serialVersionUID:
- 6343278584336189432L
-
-
Package org.apache.tika.parser.code
-
Class org.apache.tika.parser.code.SourceCodeParser
class SourceCodeParser extends AbstractEncodingDetectorParser implements Serializable- serialVersionUID:
- -4543476498190054160L
-
-
Package org.apache.tika.parser.crypto
-
Class org.apache.tika.parser.crypto.Pkcs7Parser
class Pkcs7Parser extends Object implements Serializable- serialVersionUID:
- -7310531559075115044L
-
Class org.apache.tika.parser.crypto.TSDParser
class TSDParser extends Object implements Serializable- serialVersionUID:
- 3268158344501763323L
-
-
Package org.apache.tika.parser.csv
-
Class org.apache.tika.parser.csv.TextAndCSVConfig
class TextAndCSVConfig extends Object implements Serializable-
Serialized Fields
-
delimiterToNameMap
Map<Character,
String> delimiterToNameMap -
markLimit
int markLimit
This is the mark limit in characters (not bytes) to read from the stream when classifying the stream as csv, tsv or txt. -
minConfidence
double minConfidence
minimum confidence score that there's enough evidence to determine csv/tsv vs. txt -
nameToDelimiterMap
Map<String,
Character> nameToDelimiterMap
-
-
-
Class org.apache.tika.parser.csv.TextAndCSVParser
class TextAndCSVParser extends AbstractEncodingDetectorParser implements Serializable-
Serialized Fields
-
defaultTextAndCSVConfig
TextAndCSVConfig defaultTextAndCSVConfig
-
-
-
-
Package org.apache.tika.parser.ctakes
-
Class org.apache.tika.parser.ctakes.CTAKESConfig
class CTAKESConfig extends Object implements Serializable- serialVersionUID:
- -1599741171775528923L
-
Serialized Fields
-
aeDescriptorPath
String aeDescriptorPath
-
annotationProps
CTAKESAnnotationProperty[] annotationProps
-
metadata
String[] metadata
-
prettyPrint
boolean prettyPrint
-
separatorChar
char separatorChar
-
serialize
boolean serialize
-
serializerType
CTAKESSerializer serializerType
-
stream
OutputStream stream
-
text
boolean text
-
UMLSPass
String UMLSPass
-
UMLSUser
String UMLSUser
-
-
Class org.apache.tika.parser.ctakes.CTAKESParser
class CTAKESParser extends ParserDecorator implements Serializable- serialVersionUID:
- -2313482748027097961L
-
-
Package org.apache.tika.parser.dbf
-
Class org.apache.tika.parser.dbf.DBFParser
class DBFParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.dgn
-
Class org.apache.tika.parser.dgn.DGN8Parser
class DGN8Parser extends Object implements Serializable
-
-
Package org.apache.tika.parser.dif
-
Class org.apache.tika.parser.dif.DIFParser
class DIFParser extends Object implements Serializable- serialVersionUID:
- 971505521275777826L
-
-
Package org.apache.tika.parser.dwg
-
Class org.apache.tika.parser.dwg.AbstractDWGParser
class AbstractDWGParser extends Object implements Serializable- serialVersionUID:
- 6261810259683381984L
-
Serialized Fields
-
defaultDwgParserConfig
DWGParserConfig defaultDwgParserConfig
-
-
Class org.apache.tika.parser.dwg.DWGParser
class DWGParser extends AbstractDWGParser implements Serializable- serialVersionUID:
- -7744232583079169119L
-
Class org.apache.tika.parser.dwg.DWGParserConfig
class DWGParserConfig extends Object implements Serializable- serialVersionUID:
- -7623524257255755725L
-
Serialized Fields
-
cleanDwgReadOutput
boolean cleanDwgReadOutput
-
cleanDwgReadOutputBatchSize
int cleanDwgReadOutputBatchSize
-
cleanDwgReadRegexToReplace
String cleanDwgReadRegexToReplace
-
cleanDwgReadReplaceWith
String cleanDwgReadReplaceWith
-
dwgReadExecutable
String dwgReadExecutable
-
dwgReadTimeout
long dwgReadTimeout
-
hasDwgRead
boolean hasDwgRead
-
-
Class org.apache.tika.parser.dwg.DWGParserConfig.RuntimeConfig
class RuntimeConfig extends DWGParserConfig implements Serializable -
Class org.apache.tika.parser.dwg.DWGReadParser
class DWGReadParser extends AbstractDWGParser implements Serializable- serialVersionUID:
- 7983127145030096837L
-
-
Package org.apache.tika.parser.envi
-
Class org.apache.tika.parser.envi.EnviHeaderParser
class EnviHeaderParser extends AbstractEncodingDetectorParser implements Serializable- serialVersionUID:
- -1479368523072408091L
-
-
Package org.apache.tika.parser.epub
-
Class org.apache.tika.parser.epub.EpubContentParser
class EpubContentParser extends Object implements Serializable -
Class org.apache.tika.parser.epub.EpubParser
class EpubParser extends Object implements Serializable- serialVersionUID:
- 215176772484050550L
-
Class org.apache.tika.parser.epub.OPFParser
class OPFParser extends DcXMLParser implements Serializable
-
-
Package org.apache.tika.parser.executable
-
Class org.apache.tika.parser.executable.ExecutableParser
class ExecutableParser extends Object implements Serializable- serialVersionUID:
- 32128791892482L
-
Class org.apache.tika.parser.executable.UniversalExecutableParser
class UniversalExecutableParser extends Object implements Serializable- serialVersionUID:
- 1L
-
-
Package org.apache.tika.parser.external
-
Class org.apache.tika.parser.external.ExternalParser
class ExternalParser extends Object implements Serializable-
Serialized Fields
-
checkCmd
String[] checkCmd
-
checkErrorCodes
int[] checkErrorCodes
-
checkResult
Boolean checkResult
-
commandLine
List<String> commandLine
-
config
ExternalParserConfig config
-
outputFileHandler
Parser outputFileHandler
-
stderrHandler
Parser stderrHandler
-
stdoutHandler
Parser stdoutHandler
-
supportedTypes
Set<MediaType> supportedTypes
-
-
-
Class org.apache.tika.parser.external.ExternalParserConfig
class ExternalParserConfig extends Object implements Serializable- serialVersionUID:
- 2L
-
Serialized Fields
-
checkCommandLine
List<String> checkCommandLine
-
checkErrorCodes
List<Integer> checkErrorCodes
-
commandLine
List<String> commandLine
-
contentSource
String contentSource
-
maxStdErr
int maxStdErr
-
maxStdOut
int maxStdOut
-
outputFileHandler
Parser outputFileHandler
-
returnStderr
boolean returnStderr
-
returnStdout
boolean returnStdout
-
stderrHandler
Parser stderrHandler
-
stdoutHandler
Parser stdoutHandler
-
supportedTypes
List<String> supportedTypes
-
timeoutMs
long timeoutMs
-
-
-
Package org.apache.tika.parser.feed
-
Class org.apache.tika.parser.feed.FeedParser
class FeedParser extends Object implements Serializable- serialVersionUID:
- -3785361933034525186L
-
-
Package org.apache.tika.parser.font
-
Class org.apache.tika.parser.font.AdobeFontMetricParser
class AdobeFontMetricParser extends Object implements Serializable- serialVersionUID:
- -4820306522217196835L
-
Class org.apache.tika.parser.font.TrueTypeParser
class TrueTypeParser extends Object implements Serializable- serialVersionUID:
- 44788554612243032L
-
-
Package org.apache.tika.parser.gdal
-
Class org.apache.tika.parser.gdal.GDALParser
class GDALParser extends Object implements Serializable- serialVersionUID:
- -3869130527323941401L
-
Serialized Fields
-
command
String command
-
maxStdErr
int maxStdErr
-
maxStdOut
int maxStdOut
-
timeoutMs
long timeoutMs
-
-
-
Package org.apache.tika.parser.geo.topic
-
Class org.apache.tika.parser.geo.topic.GeoParser
class GeoParser extends Object implements Serializable- serialVersionUID:
- -2241391757440215491L
-
Serialized Fields
-
available
boolean available
-
defaultConfig
GeoParserConfig defaultConfig
-
gazetteerClient
GeoGazetteerClient gazetteerClient
-
initialized
boolean initialized
-
modelUrl
URL modelUrl
-
nameFinder
opennlp.tools.namefind.NameFinderME nameFinder
-
-
Class org.apache.tika.parser.geo.topic.GeoParserConfig
class GeoParserConfig extends Object implements Serializable- serialVersionUID:
- -3167692634278575818L
-
Class org.apache.tika.parser.geo.topic.GeoParserConfig.RuntimeConfig
class RuntimeConfig extends GeoParserConfig implements Serializable
-
-
Package org.apache.tika.parser.geo.topic.gazetteer
-
Class org.apache.tika.parser.geo.topic.gazetteer.Location
class Location extends Object implements Serializable- serialVersionUID:
- -59485448766406004L
-
-
Package org.apache.tika.parser.geoinfo
-
Class org.apache.tika.parser.geoinfo.GeographicInformationParser
class GeographicInformationParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.geopkg
-
Class org.apache.tika.parser.geopkg.GeoPkgParser
class GeoPkgParser extends SQLite3Parser implements Serializable- serialVersionUID:
- -752276948656079347L
-
-
Package org.apache.tika.parser.grib
-
Class org.apache.tika.parser.grib.GribParser
class GribParser extends Object implements Serializable- serialVersionUID:
- 7855458954474247655L
-
-
Package org.apache.tika.parser.hdf
-
Class org.apache.tika.parser.hdf.HDFParser
class HDFParser extends Object implements Serializable- serialVersionUID:
- 1091208208003437549L
-
-
Package org.apache.tika.parser.html
-
Exception org.apache.tika.parser.html.DataURISchemeParseException
class DataURISchemeParseException extends TikaException implements Serializable -
Class org.apache.tika.parser.html.HtmlEncodingDetector
class HtmlEncodingDetector extends Object implements Serializable-
Serialized Fields
-
defaultConfig
HtmlEncodingDetector.Config defaultConfig
-
-
-
Class org.apache.tika.parser.html.HtmlEncodingDetector.Config
class Config extends Object implements Serializable-
Serialized Fields
-
markLimit
int markLimit
-
-
-
Class org.apache.tika.parser.html.JSoupParser
class JSoupParser extends AbstractEncodingDetectorParser implements Serializable- serialVersionUID:
- 7895315240498733128L
-
Serialized Fields
-
extractScripts
boolean extractScripts
-
-
Class org.apache.tika.parser.html.JSoupParser.Config
class Config extends Object implements Serializable-
Serialized Fields
-
extractScripts
boolean extractScripts
-
-
-
-
Package org.apache.tika.parser.html.charsetdetector
-
Class org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector
class StandardHtmlEncodingDetector extends Object implements Serializable-
Serialized Fields
-
markLimit
int markLimit
-
-
-
-
Package org.apache.tika.parser.html.charsetdetector.charsets
-
Exception org.apache.tika.parser.html.charsetdetector.charsets.XUserDefinedCharset.NotImplementedException
class NotImplementedException extends RuntimeException implements Serializable
-
-
Package org.apache.tika.parser.http
-
Class org.apache.tika.parser.http.HttpParser
class HttpParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.hwp
-
Class org.apache.tika.parser.hwp.HwpTextExtractorV5
class HwpTextExtractorV5 extends Object implements Serializable- serialVersionUID:
- 1L
-
Class org.apache.tika.parser.hwp.HwpV5Parser
class HwpV5Parser extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
extractor
HwpTextExtractorV5 extractor
-
-
-
Package org.apache.tika.parser.image
-
Class org.apache.tika.parser.image.AbstractImageParser
class AbstractImageParser extends Object implements Serializable -
Class org.apache.tika.parser.image.BPGParser
class BPGParser extends AbstractImageParser implements Serializable- serialVersionUID:
- -161736541253892772L
-
Serialized Fields
-
maxRecordLength
int maxRecordLength
-
-
Class org.apache.tika.parser.image.HeifParser
class HeifParser extends AbstractImageParser implements Serializable -
Class org.apache.tika.parser.image.ICNSParser
class ICNSParser extends Object implements Serializable- serialVersionUID:
- 922010233654248327L
-
Class org.apache.tika.parser.image.ImageParser
class ImageParser extends AbstractImageParser implements Serializable- serialVersionUID:
- 7852529269245520335L
-
Class org.apache.tika.parser.image.JpegParser
class JpegParser extends AbstractImageParser implements Serializable- serialVersionUID:
- -1355028253756234603L
-
Class org.apache.tika.parser.image.JXLParser
class JXLParser extends Object implements Serializable -
Class org.apache.tika.parser.image.PSDParser
class PSDParser extends Object implements Serializable- serialVersionUID:
- 883387734607994914L
-
Serialized Fields
-
defaultConfig
PSDParser.PSDParserConfig defaultConfig
-
-
Class org.apache.tika.parser.image.TiffParser
class TiffParser extends AbstractImageParser implements Serializable- serialVersionUID:
- -3941143576535464926L
-
Class org.apache.tika.parser.image.WebPParser
class WebPParser extends Object implements Serializable- serialVersionUID:
- -3941143576535464926L
-
-
Package org.apache.tika.parser.indesign
-
Class org.apache.tika.parser.indesign.IDMLParser
class IDMLParser extends Object implements Serializable-
Serialized Fields
-
masterSpreadCount
int masterSpreadCount
Internal master spread count. -
pageCount
int pageCount
Internal page count.
-
-
-
-
Package org.apache.tika.parser.iptc
-
Class org.apache.tika.parser.iptc.IptcAnpaParser
class IptcAnpaParser extends Object implements Serializable- serialVersionUID:
- -6062820170212879115L
-
Serialized Fields
-
FMT_ANPA_1312
int FMT_ANPA_1312
-
FMT_ANPA_UPI
int FMT_ANPA_UPI
-
FMT_ANPA_UPI_DL
int FMT_ANPA_UPI_DL
-
FMT_IPTC_7901
int FMT_IPTC_7901
-
FMT_IPTC_AP
int FMT_IPTC_AP
-
FMT_IPTC_BLM
int FMT_IPTC_BLM
-
FMT_IPTC_CHAR
int FMT_IPTC_CHAR
-
FMT_IPTC_NYT
int FMT_IPTC_NYT
-
FMT_IPTC_PHOTO
int FMT_IPTC_PHOTO
-
FMT_IPTC_RTR
int FMT_IPTC_RTR
-
FMT_NITF
int FMT_NITF
-
FMT_NITF_RB
int FMT_NITF_RB
-
FMT_NITF_TT
int FMT_NITF_TT
-
FORMAT
int FORMAT
-
-
-
Package org.apache.tika.parser.isatab
-
Class org.apache.tika.parser.isatab.ISArchiveParser
class ISArchiveParser extends Object implements Serializable- serialVersionUID:
- 3640809327541300229L
-
-
Package org.apache.tika.parser.iwork
-
Class org.apache.tika.parser.iwork.IWorkPackageParser
class IWorkPackageParser extends Object implements Serializable- serialVersionUID:
- -2160322853809682372L
-
-
Package org.apache.tika.parser.iwork.iwana
-
Class org.apache.tika.parser.iwork.iwana.IWork13PackageParser
class IWork13PackageParser extends Object implements Serializable -
Class org.apache.tika.parser.iwork.iwana.IWork18PackageParser
class IWork18PackageParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.jdbc
-
Class org.apache.tika.parser.jdbc.AbstractDBParser
class AbstractDBParser extends Object implements Serializable-
Serialized Fields
-
connection
Connection connection
-
-
-
-
Package org.apache.tika.parser.journal
-
Class org.apache.tika.parser.journal.JournalParser
class JournalParser extends Object implements Serializable- serialVersionUID:
- 4664255544154296438L
-
-
Package org.apache.tika.parser.mail
-
Class org.apache.tika.parser.mail.RFC822Parser
class RFC822Parser extends Object implements Serializable- serialVersionUID:
- -5504243905998074168L
-
Serialized Fields
-
defaultConfig
RFC822Parser.Config defaultConfig
-
detector
Detector detector
-
-
-
Package org.apache.tika.parser.mat
-
Class org.apache.tika.parser.mat.MatParser
class MatParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.mbox
-
Class org.apache.tika.parser.mbox.MboxParser
class MboxParser extends Object implements Serializable- serialVersionUID:
- -1762689436731160661L
-
-
Package org.apache.tika.parser.microsoft
-
Class org.apache.tika.parser.microsoft.AbstractOfficeParser
class AbstractOfficeParser extends Object implements Serializable-
Serialized Fields
-
defaultOfficeParserConfig
OfficeParserConfig defaultOfficeParserConfig
-
-
-
Class org.apache.tika.parser.microsoft.EMFParser
class EMFParser extends Object implements Serializable -
Class org.apache.tika.parser.microsoft.JackcessParser
class JackcessParser extends Object implements Serializable- serialVersionUID:
- -752276948656079347L
-
Serialized Fields
-
locale
Locale locale
-
-
Class org.apache.tika.parser.microsoft.MSOwnerFileParser
class MSOwnerFileParser extends Object implements Serializable- serialVersionUID:
- -752276948656079347L
-
Class org.apache.tika.parser.microsoft.OfficeParser
class OfficeParser extends AbstractOfficeParser implements Serializable- serialVersionUID:
- 7393462244028653479L
-
Class org.apache.tika.parser.microsoft.OfficeParserConfig
class OfficeParserConfig extends Object implements Serializable-
Serialized Fields
-
concatenatePhoneticRuns
boolean concatenatePhoneticRuns
-
dateOverrideFormat
String dateOverrideFormat
-
extractMacros
boolean extractMacros
-
includeDeletedContent
boolean includeDeletedContent
-
includeGlossary
boolean includeGlossary
-
includeHeadersAndFooters
boolean includeHeadersAndFooters
-
includeMissingRows
boolean includeMissingRows
-
includeMoveFromContent
boolean includeMoveFromContent
-
includeShapeBasedContent
boolean includeShapeBasedContent
-
includeSlideMasterContent
boolean includeSlideMasterContent
-
includeSlideNotes
boolean includeSlideNotes
-
maxOverride
int maxOverride
-
preferAlternateContentChoice
boolean preferAlternateContentChoice
-
rtfEmbeddedMaxBytesInKb
int rtfEmbeddedMaxBytesInKb
Maximum bytes per embedded object/pict when extracting from RTF within MSG files. Since embedded data is streamed to disk (not held in memory), the default is 2 GB. Set to -1 for unlimited. -
writeSelectHeadersInBody
boolean writeSelectHeadersInBody
-
-
-
Class org.apache.tika.parser.microsoft.OldExcelParser
class OldExcelParser extends Object implements Serializable- serialVersionUID:
- 4611820730372823452L
-
Class org.apache.tika.parser.microsoft.TikaExcelGeneralFormat
class TikaExcelGeneralFormat extends Format implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
decimalFormat
DecimalFormat decimalFormat
-
decimalSymbols
DecimalFormatSymbols decimalSymbols
-
integerFormat
DecimalFormat integerFormat
-
scientificFormat
DecimalFormat scientificFormat
-
-
Class org.apache.tika.parser.microsoft.TNEFParser
class TNEFParser extends Object implements Serializable- serialVersionUID:
- 4611820730372823452L
-
Class org.apache.tika.parser.microsoft.WMFParser
class WMFParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.microsoft.activemime
-
Class org.apache.tika.parser.microsoft.activemime.ActiveMimeParser
class ActiveMimeParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.microsoft.chm
-
Class org.apache.tika.parser.microsoft.chm.ChmItsfHeader
class ChmItsfHeader extends Object implements Serializable- serialVersionUID:
- 2215291838533213826L
-
Serialized Fields
-
currentPlace
int currentPlace
-
data_offset
long data_offset
-
dataRemained
int dataRemained
-
dir_len
long dir_len
-
dir_offset
long dir_offset
-
dir_uuid
byte[] dir_uuid
-
header_len
int header_len
-
lang_id
long lang_id
-
last_modified
long last_modified
-
signature
byte[] signature
-
stream_uuid
byte[] stream_uuid
-
unknown_000c
int unknown_000c
-
unknown_len
long unknown_len
-
unknown_offset
long unknown_offset
-
version
int version
-
-
Class org.apache.tika.parser.microsoft.chm.ChmItspHeader
class ChmItspHeader extends Object implements Serializable- serialVersionUID:
- 1962394421998181341L
-
Serialized Fields
-
block_len
long block_len
-
blockidx_intvl
int blockidx_intvl
-
currentPlace
int currentPlace
-
dataRemained
int dataRemained
-
header_len
int header_len
-
index_depth
int index_depth
-
index_head
int index_head
-
index_root
int index_root
-
lang_id
long lang_id
-
num_blocks
long num_blocks
-
signature
byte[] signature
-
system_uuid
byte[] system_uuid
-
unknown_000c
int unknown_000c
-
unknown_0024
int unknown_0024
-
unknown_002c
int unknown_002c
-
unknown_0044
byte[] unknown_0044
-
version
int version
-
-
Class org.apache.tika.parser.microsoft.chm.ChmLzxcControlData
class ChmLzxcControlData extends Object implements Serializable- serialVersionUID:
- -7897854774939631565L
-
Serialized Fields
-
currentPlace
int currentPlace
-
dataRemained
int dataRemained
-
resetInterval
long resetInterval
-
signature
byte[] signature
-
size
long size
-
unknown_18
long unknown_18
-
version
long version
-
windowSize
long windowSize
-
windowsPerReset
long windowsPerReset
-
-
Class org.apache.tika.parser.microsoft.chm.ChmLzxcResetTable
class ChmLzxcResetTable extends Object implements Serializable- serialVersionUID:
- -8209574429411707460L
-
Serialized Fields
-
block_address
long[] block_address
-
block_count
long block_count
-
block_len
long block_len
-
compressed_len
long compressed_len
-
currentPlace
int currentPlace
-
dataRemained
int dataRemained
-
table_offset
long table_offset
-
uncompressed_len
long uncompressed_len
-
unknown
long unknown
-
version
long version
-
-
Class org.apache.tika.parser.microsoft.chm.ChmParser
class ChmParser extends Object implements Serializable- serialVersionUID:
- 5938777307516469802L
-
Exception org.apache.tika.parser.microsoft.chm.ChmParsingException
class ChmParsingException extends TikaException implements Serializable- serialVersionUID:
- 6497936044733665210L
-
Class org.apache.tika.parser.microsoft.chm.ChmPmgiHeader
class ChmPmgiHeader extends Object implements Serializable- serialVersionUID:
- -2092282339894303701L
-
Serialized Fields
-
currentPlace
int currentPlace
-
dataRemained
int dataRemained
-
free_space
long free_space
-
signature
byte[] signature
-
-
Class org.apache.tika.parser.microsoft.chm.ChmPmglHeader
class ChmPmglHeader extends Object implements Serializable- serialVersionUID:
- -6139486487475923593L
-
Serialized Fields
-
block_next
int block_next
-
block_prev
int block_prev
-
currentPlace
int currentPlace
-
dataRemained
int dataRemained
-
free_space
long free_space
-
signature
byte[] signature
-
unknown_0008
long unknown_0008
-
-
-
Package org.apache.tika.parser.microsoft.libpst
-
Class org.apache.tika.parser.microsoft.libpst.LibPstParser
class LibPstParser extends Object implements Serializable-
Serialized Fields
-
defaultConfig
LibPstParserConfig defaultConfig
-
-
-
Class org.apache.tika.parser.microsoft.libpst.LibPstParserConfig
class LibPstParserConfig extends Object implements Serializable-
Serialized Fields
-
includeDeleted
boolean includeDeleted
-
isDebug
boolean isDebug
In initial tests, setting this to true resulted in more emails being extracted. It did dramatically slow down processing time. :( -
maxEmails
int maxEmails
max emails to process. Will process everything if this value is < 0 -
processEmailAsMsg
boolean processEmailAsMsg
Should readpst also output msg files for processing. In an initial test, not as many attachments were extracted from msg files. Not yet clear if that is a POI limitation or a problem with libpst -
readPstPath
String readPstPath
This should include the path up to but not including 'readpst', e.g. "C:\my_bin" where readpst is at "C:\my_bin\readpst" -
timeoutSeconds
long timeoutSeconds
-
-
-
Class org.apache.tika.parser.microsoft.libpst.LibPstParserConfig.RuntimeConfig
class RuntimeConfig extends LibPstParserConfig implements Serializable
-
-
Package org.apache.tika.parser.microsoft.onenote
-
Class org.apache.tika.parser.microsoft.onenote.OneNoteParser
class OneNoteParser extends Object implements Serializable- serialVersionUID:
- -5504243905998074168L
-
Serialized Fields
-
options
OneNoteTreeWalkerOptions options
-
-
Class org.apache.tika.parser.microsoft.onenote.OneNoteTreeWalkerOptions
class OneNoteTreeWalkerOptions extends Object implements Serializable-
Serialized Fields
-
crawlAllFileNodesFromRoot
boolean crawlAllFileNodesFromRoot
-
onlyLatestRevision
boolean onlyLatestRevision
-
utf16PropertiesToPrint
Set<OneNotePropertyEnum> utf16PropertiesToPrint
-
-
-
-
Package org.apache.tika.parser.microsoft.onenote.fsshttpb.exception
-
Exception org.apache.tika.parser.microsoft.onenote.fsshttpb.exception.DataElementParseErrorException
class DataElementParseErrorException extends RuntimeException implements Serializable-
Serialized Fields
-
index
int index
-
-
-
-
Package org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj
-
Exception org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.StreamObjectParseErrorException
class StreamObjectParseErrorException extends RuntimeException implements Serializable-
Serialized Fields
-
index
int index
-
streamObjectTypeName
String streamObjectTypeName
-
-
-
-
Package org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned
-
Class org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.UByte
class UByte extends UNumber implements Serializable- serialVersionUID:
- -6821055240959745390L
-
Serialization Methods
-
readResolve
Replace version read through deserialization with cached version. Note that this does not use the UByte.valueOfUnchecked(short) as we have no guarantee that the value from the stream is valid.- Throws:
ObjectStreamException
-
-
Serialized Fields
-
value
short value
The value modelling the content of this unsigned byte
-
-
Class org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.UInteger
class UInteger extends UNumber implements Serializable- serialVersionUID:
- -6821055240959745390L
-
Serialization Methods
-
readResolve
Replace version read through deserialization with cached version.- Throws:
ObjectStreamException
-
-
Serialized Fields
-
value
long value
The value modelling the content of this unsigned int
-
-
Class org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.ULong
class ULong extends UNumber implements Serializable- serialVersionUID:
- -6821055240959745390L
-
Serialized Fields
-
value
long value
The value modelling the content of this unsigned long
-
-
Class org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.UNumber
class UNumber extends Number implements Serializable- serialVersionUID:
- -7666221938815339843L
-
Class org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.UShort
class UShort extends UNumber implements Serializable- serialVersionUID:
- -6821055240959745390L
-
Serialized Fields
-
value
int value
The value modelling the content of this unsigned short
-
-
-
Package org.apache.tika.parser.microsoft.ooxml
-
Class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
class OOXMLParser extends AbstractOfficeParser implements Serializable- serialVersionUID:
- 6535995710857776481L
-
-
Package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006
-
Class org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
class Word2006MLParser extends AbstractOfficeParser implements Serializable
-
-
Package org.apache.tika.parser.microsoft.pst
-
Class org.apache.tika.parser.microsoft.pst.OutlookPSTParser
class OutlookPSTParser extends Object implements Serializable- serialVersionUID:
- 620998217748364063L
-
Class org.apache.tika.parser.microsoft.pst.PSTMailItemParser
class PSTMailItemParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.microsoft.rtf
-
Class org.apache.tika.parser.microsoft.rtf.RTFParser
class RTFParser extends Object implements Serializable- serialVersionUID:
- -4165069489372320313L
-
Serialized Fields
-
ignoreListMarkup
boolean ignoreListMarkup
-
memoryLimitInKb
int memoryLimitInKb
-
-
-
Package org.apache.tika.parser.microsoft.xml
-
Class org.apache.tika.parser.microsoft.xml.AbstractXML2003Parser
class AbstractXML2003Parser extends Object implements Serializable -
Class org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser
class SpreadsheetMLParser extends AbstractXML2003Parser implements Serializable -
Class org.apache.tika.parser.microsoft.xml.WordMLParser
class WordMLParser extends AbstractXML2003Parser implements Serializable
-
-
Package org.apache.tika.parser.mif
-
Class org.apache.tika.parser.mif.MIFParser
class MIFParser extends AbstractEncodingDetectorParser implements Serializable
-
-
Package org.apache.tika.parser.mp3
-
Class org.apache.tika.parser.mp3.Mp3Parser
class Mp3Parser extends Object implements Serializable- serialVersionUID:
- 8537074922934844370L
-
-
Package org.apache.tika.parser.mp4
-
Class org.apache.tika.parser.mp4.MP4Parser
class MP4Parser extends Object implements Serializable- serialVersionUID:
- 84011216792285L
-
-
Package org.apache.tika.parser.multiple
-
Class org.apache.tika.parser.multiple.AbstractMultipleParser
class AbstractMultipleParser extends Object implements Serializable- serialVersionUID:
- 5383668090329836559L
-
Serialized Fields
-
offeredTypes
Set<MediaType> offeredTypes
Computed list of Mime Types to offer, which is all those in common between the parsers. For explicit mimetypes only, use a ParserDecorator -
parsers
Collection<? extends Parser> parsers
List of the multiple parsers to try. -
policy
AbstractMultipleParser.MetadataPolicy policy
How we should handle metadata clashes -
registry
MediaTypeRegistry registry
Media type registry.
-
-
Class org.apache.tika.parser.multiple.FallbackParser
class FallbackParser extends AbstractMultipleParser implements Serializable- serialVersionUID:
- 5844409020977206167L
-
Class org.apache.tika.parser.multiple.SupplementingParser
class SupplementingParser extends AbstractMultipleParser implements Serializable- serialVersionUID:
- 313179254565350994L
-
-
Package org.apache.tika.parser.ner
-
Class org.apache.tika.parser.ner.NamedEntityParser
class NamedEntityParser extends Object implements Serializable-
Serialized Fields
-
available
boolean available
-
initialized
boolean initialized
-
nerChain
List<NERecogniser> nerChain
-
secondaryParser
Tika secondaryParser
-
-
-
-
Package org.apache.tika.parser.netcdf
-
Class org.apache.tika.parser.netcdf.NetCDFParser
class NetCDFParser extends Object implements Serializable- serialVersionUID:
- -5940938274907708665L
-
-
Package org.apache.tika.parser.ocr
-
Class org.apache.tika.parser.ocr.TesseractOCRConfig
class TesseractOCRConfig extends Object implements Serializable- serialVersionUID:
- -4861942486845757891L
-
Serialized Fields
-
applyRotation
boolean applyRotation
-
colorspace
String colorspace
-
density
int density
-
depth
int depth
-
enableImagePreprocessing
boolean enableImagePreprocessing
-
filter
String filter
-
imageMagickPath
String imageMagickPath
-
inlineContent
boolean inlineContent
-
language
String language
-
maxFileSizeToOcr
long maxFileSizeToOcr
-
minFileSizeToOcr
long minFileSizeToOcr
-
otherTesseractConfig
HashMap<String,
String> otherTesseractConfig -
outputType
TesseractOCRConfig.OUTPUT_TYPE outputType
-
pageSegMode
String pageSegMode
-
pageSeparator
String pageSeparator
-
preloadLangs
boolean preloadLangs
-
preserveInterwordSpacing
boolean preserveInterwordSpacing
-
resize
int resize
-
skipOcr
boolean skipOcr
-
tessdataPath
String tessdataPath
-
tesseractPath
String tesseractPath
-
timeoutSeconds
int timeoutSeconds
-
-
Class org.apache.tika.parser.ocr.TesseractOCRConfig.RuntimeConfig
class RuntimeConfig extends TesseractOCRConfig implements Serializable -
Class org.apache.tika.parser.ocr.TesseractOCRParser
class TesseractOCRParser extends AbstractExternalProcessParser implements Serializable- serialVersionUID:
- -8167538283213097265L
-
Serialized Fields
-
defaultConfig
TesseractOCRConfig defaultConfig
-
hasCheckedForImageMagick
boolean hasCheckedForImageMagick
-
hasImageMagick
boolean hasImageMagick
-
hasTesseract
boolean hasTesseract
-
imagePreprocessor
org.apache.tika.parser.ocr.ImagePreprocessor imagePreprocessor
-
langs
Set<String> langs
-
-
-
Package org.apache.tika.parser.ocr.tess4j
-
Class org.apache.tika.parser.ocr.tess4j.Tess4JConfig
class Tess4JConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
dataPath
String dataPath
Path to the tessdata directory containing language data files. If empty, tess4j will try to find the tessdata directory automatically. -
dpi
int dpi
DPI for image rendering. Default is 300. -
language
String language
Language dictionary to be used. Default is "eng". -
maxFileSizeToOcr
long maxFileSizeToOcr
Maximum file size (in bytes) to submit to OCR. Default is 50 MB. -
maxImagePixels
long maxImagePixels
Maximum total pixels (width × height) allowed for an image before OCR is skipped. This prevents OOM from decompressing pathologically large images (e.g., a 30,000 × 30,000 image would require ~3.6 GB of heap as a BufferedImage). Default is 100,000,000 (100 megapixels, ~10,000 × 10,000). Set to
-1 for no limit (not recommended). -
minFileSizeToOcr
long minFileSizeToOcr
Minimum file size (in bytes) to submit to OCR. Default is 0. -
nativeLibPath
String nativeLibPath
Path to the directory containing native Tesseract and Leptonica shared libraries (e.g., libtesseract.dylib, libtesseract.so). On macOS with Homebrew, this is typically
/opt/homebrew/lib. On Linux, it may be /usr/lib or /usr/local/lib. If empty, JNA will search the default system library paths.
-
ocrEngineMode
int ocrEngineMode
Tesseract OCR Engine mode. Default is 3 (Default, based on what is available). - 0 = Original Tesseract only.
- 1 = Neural nets LSTM only.
- 2 = Tesseract + LSTM.
- 3 = Default, based on what is available.
-
pageSegMode
int pageSegMode
Tesseract page segmentation mode. Default is 1. - 0 = Orientation and script detection (OSD) only.
- 1 = Automatic page segmentation with OSD.
- 3 = Fully automatic page segmentation, but no OSD. (Default for Tesseract)
- 6 = Assume a single uniform block of text.
-
poolSize
int poolSize
Number of Tesseract instances to keep in the pool. Default is 2. -
skipOcr
boolean skipOcr
Runtime switch to turn off OCR. -
timeoutSeconds
int timeoutSeconds
Maximum time (in seconds) to wait for a Tesseract instance from the pool. Default is 120.
-
-
Class org.apache.tika.parser.ocr.tess4j.Tess4JConfig.RuntimeConfig
class RuntimeConfig extends Tess4JConfig implements Serializable -
Class org.apache.tika.parser.ocr.tess4j.Tess4JParser
class Tess4JParser extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
defaultConfig
Tess4JConfig defaultConfig
-
initialized
boolean initialized
-
-
-
Package org.apache.tika.parser.ocrencode
-
Class org.apache.tika.parser.ocrencode.EncodeOCRConfig
class EncodeOCRConfig extends Object implements Serializable- serialVersionUID:
- -1761942486845717891L
-
Serialized Fields
-
inlineContent
boolean inlineContent
-
maxFileSizeToOcr
long maxFileSizeToOcr
-
maxImagesToOcr
int maxImagesToOcr
-
minFileSizeToOcr
long minFileSizeToOcr
-
skipOcr
boolean skipOcr
-
-
Class org.apache.tika.parser.ocrencode.EncodeOCRParser
class EncodeOCRParser extends AbstractExternalProcessParser implements Serializable- serialVersionUID:
- -8167538283213097266L
-
Serialized Fields
-
defaultConfig
EncodeOCRConfig defaultConfig
-
-
-
Package org.apache.tika.parser.odf
-
Class org.apache.tika.parser.odf.FlatOpenDocumentParser
class FlatOpenDocumentParser extends Object implements Serializable- serialVersionUID:
- -8739250869531737584L
-
Serialized Fields
-
extractMacros
boolean extractMacros
-
-
Class org.apache.tika.parser.odf.OpenDocumentContentParser
class OpenDocumentContentParser extends Object implements Serializable -
Class org.apache.tika.parser.odf.OpenDocumentMetaParser
class OpenDocumentMetaParser extends XMLParser implements Serializable- serialVersionUID:
- -8739250869531737584L
-
Class org.apache.tika.parser.odf.OpenDocumentParser
class OpenDocumentParser extends Object implements Serializable- serialVersionUID:
- -6410276875438618287L
-
-
Package org.apache.tika.parser.ogg
-
Class org.apache.tika.parser.ogg.FlacParser
class FlacParser extends AbstractParser implements Serializable- serialVersionUID:
- -7546577301474546694L
-
Class org.apache.tika.parser.ogg.OggAudioParser
class OggAudioParser extends AbstractParser implements Serializable- serialVersionUID:
- 5168743829615945633L
-
Class org.apache.tika.parser.ogg.OggParser
class OggParser extends AbstractParser implements Serializable- serialVersionUID:
- -5686095376587813226L
-
Class org.apache.tika.parser.ogg.OpusParser
class OpusParser extends OggAudioParser implements Serializable- serialVersionUID:
- 5904981674814527529L
-
Class org.apache.tika.parser.ogg.SpeexParser
class SpeexParser extends OggAudioParser implements Serializable- serialVersionUID:
- 5904981674814527529L
-
Class org.apache.tika.parser.ogg.TheoraParser
class TheoraParser extends AbstractParser implements Serializable- serialVersionUID:
- -5459916822092342944L
-
Class org.apache.tika.parser.ogg.VorbisParser
class VorbisParser extends OggAudioParser implements Serializable- serialVersionUID:
- 5904981674814527529L
-
-
Package org.apache.tika.parser.pdf
-
Class org.apache.tika.parser.pdf.OcrConfig
class OcrConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
dpi
int dpi
-
imageFormat
OcrConfig.ImageFormat imageFormat
-
imageQuality
float imageQuality
-
imageType
OcrConfig.ImageType imageType
-
maxImagePixels
long maxImagePixels
Maximum total pixels (width × height) allowed for a rendered page image before OCR is skipped for that page. This prevents OOM from rendering pathologically large PDF pages (e.g., architectural drawings, maps) via PDFBox's in-process renderer. When using the Poppler renderer, prefer
maxScaleTo on PopplerRenderer instead — it prevents the large image from ever being created. This limit is the safety net for the PDFBox rendering path. Default is 100,000,000 (100 megapixels, roughly 10,000 × 10,000). Set to
-1 for no limit (not recommended). -
maxPagesToOcr
int maxPagesToOcr
Maximum number of pages to OCR per document. Pages beyond this limit are processed for text extraction only (if applicable) but not rendered or sent to OCR. Default is
-1 (no limit — all pages are eligible for OCR). -
renderingStrategy
OcrConfig.RenderingStrategy renderingStrategy
-
strategy
OcrConfig.Strategy strategy
-
strategyAuto
OcrConfig.StrategyAuto strategyAuto
-
-
Class org.apache.tika.parser.pdf.OcrConfig.StrategyAuto
class StrategyAuto extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
totalCharsPerPage
int totalCharsPerPage
-
unmappedUnicodeCharsPerPage
float unmappedUnicodeCharsPerPage
-
-
Class org.apache.tika.parser.pdf.PDFParser
class PDFParser extends Object implements Serializable- serialVersionUID:
- -752276948656079347L
-
Serialized Fields
-
defaultConfig
PDFParserConfig defaultConfig
-
renderer
Renderer renderer
-
-
Class org.apache.tika.parser.pdf.PDFParserConfig
class PDFParserConfig extends Object implements Serializable- serialVersionUID:
- 6492570218190936986L
-
Serialized Fields
-
accessCheckMode
PDFParserConfig.AccessCheckMode accessCheckMode
-
averageCharTolerance
Float averageCharTolerance
-
catchIntermediateIOExceptions
boolean catchIntermediateIOExceptions
-
detectAngles
boolean detectAngles
-
dropThreshold
float dropThreshold
-
enableAutoSpace
boolean enableAutoSpace
-
extractAcroFormContent
boolean extractAcroFormContent
-
extractActions
boolean extractActions
-
extractAnnotationText
boolean extractAnnotationText
-
extractBookmarksText
boolean extractBookmarksText
-
extractFontNames
boolean extractFontNames
-
extractIncrementalUpdateInfo
boolean extractIncrementalUpdateInfo
-
extractInlineImageMetadataOnly
boolean extractInlineImageMetadataOnly
-
extractInlineImages
boolean extractInlineImages
-
extractMarkedContent
boolean extractMarkedContent
-
extractUniqueInlineImagesOnly
boolean extractUniqueInlineImagesOnly
-
ifXFAExtractOnlyXFA
boolean ifXFAExtractOnlyXFA
-
ignoreContentStreamSpaceGlyphs
boolean ignoreContentStreamSpaceGlyphs
-
imageGraphicsEngineFactory
ImageGraphicsEngineFactory imageGraphicsEngineFactory
-
imageStrategy
PDFParserConfig.IMAGE_STRATEGY imageStrategy
Should the entire document be rendered? -
maxIncrementalUpdates
int maxIncrementalUpdates
-
maxMainMemoryBytes
long maxMainMemoryBytes
-
ocr
OcrConfig ocr
-
parseIncrementalUpdates
boolean parseIncrementalUpdates
-
setKCMS
boolean setKCMS
-
sortByPosition
boolean sortByPosition
-
spacingTolerance
Float spacingTolerance
-
suppressDuplicateOverlappingText
boolean suppressDuplicateOverlappingText
-
throwOnEncryptedPayload
boolean throwOnEncryptedPayload
-
-
-
Package org.apache.tika.parser.pdf.image
-
Class org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory
class ImageGraphicsEngineFactory extends Object implements Serializable
-
-
Package org.apache.tika.parser.pkg
-
Class org.apache.tika.parser.pkg.AbstractArchiveParser
class AbstractArchiveParser extends AbstractEncodingDetectorParser implements Serializable -
Class org.apache.tika.parser.pkg.CompressorParser
class CompressorParser extends Object implements Serializable- serialVersionUID:
- 2793565792967222459L
-
Serialized Fields
-
defaultConfig
CompressorParser.Config defaultConfig
-
-
Class org.apache.tika.parser.pkg.PackageParser
class PackageParser extends AbstractArchiveParser implements Serializable- serialVersionUID:
- -5331043266963888708L
-
Class org.apache.tika.parser.pkg.RarParser
class RarParser extends Object implements Serializable- serialVersionUID:
- 6157727985054451501L
-
Class org.apache.tika.parser.pkg.SevenZParser
class SevenZParser extends AbstractArchiveParser implements Serializable- serialVersionUID:
- -5331043266963888710L
-
Class org.apache.tika.parser.pkg.UnrarParser
class UnrarParser extends Object implements Serializable- serialVersionUID:
- 6157727985054451501L
-
Serialized Fields
-
timeoutMillis
long timeoutMillis
-
-
Class org.apache.tika.parser.pkg.ZipParser
class ZipParser extends AbstractArchiveParser implements Serializable- serialVersionUID:
- -5331043266963888709L
-
Serialized Fields
-
defaultConfig
ZipParserConfig defaultConfig
-
-
Class org.apache.tika.parser.pkg.ZipParserConfig
class ZipParserConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
detectCharsetsInEntryNames
boolean detectCharsetsInEntryNames
Whether to run charset detection on entry names to handle non-Unicode filenames. Default is true. -
entryEncoding
Charset entryEncoding
The charset to use for reading entry names. If null, the parser will use the platform default or auto-detect based on ZipParserConfig.detectCharsetsInEntryNames. -
integrityCheck
boolean integrityCheck
Whether to perform integrity checking by comparing the central directory (read via file-based access) against local file headers (read via streaming). This can detect: - Duplicate entry names (potential attack vector)
- Entries in central directory but not in local headers
- Entries in local headers but not in central directory
-
-
-
Package org.apache.tika.parser.prt
-
Class org.apache.tika.parser.prt.PRTParser
class PRTParser extends Object implements Serializable- serialVersionUID:
- 4659638314375035178L
-
-
Package org.apache.tika.parser.sas
-
Class org.apache.tika.parser.sas.SAS7BDATParser
class SAS7BDATParser extends Object implements Serializable- serialVersionUID:
- -2775485539937983150L
-
-
Package org.apache.tika.parser.sqlite3
-
Class org.apache.tika.parser.sqlite3.SQLite3DBParser
class SQLite3DBParser extends AbstractDBParser implements Serializable-
Serialized Fields
-
tmpFile
Path tmpFile
-
-
-
Class org.apache.tika.parser.sqlite3.SQLite3Parser
class SQLite3Parser extends Object implements Serializable- serialVersionUID:
- -752276948656079347L
-
-
Package org.apache.tika.parser.strings
-
Class org.apache.tika.parser.strings.Latin1StringsParser
class Latin1StringsParser extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
inPos
int inPos
The position into the input buffer. -
input
byte[] input
The input buffer. -
inSize
int inSize
The number of bytes into the input buffer. -
minSize
int minSize
The minimum size of a character sequence to be extracted. -
outPos
int outPos
The current position into the output buffer. -
output
byte[] output
The output buffer. -
tmpPos
int tmpPos
The temporary position into the output buffer. -
xhtml
XHTMLContentHandler xhtml
The output content handler.
-
-
Class org.apache.tika.parser.strings.StringsConfig
class StringsConfig extends Object implements Serializable- serialVersionUID:
- -1465227101645003594L
-
Serialized Fields
-
encoding
StringsEncoding encoding
-
filePath
String filePath
-
minLength
int minLength
-
stringsPath
String stringsPath
-
timeoutSeconds
int timeoutSeconds
-
-
Class org.apache.tika.parser.strings.StringsConfig.RuntimeConfig
class RuntimeConfig extends StringsConfig implements Serializable -
Class org.apache.tika.parser.strings.StringsParser
class StringsParser extends Object implements Serializable- serialVersionUID:
- 802566634661575025L
-
Serialized Fields
-
defaultConfig
StringsConfig defaultConfig
-
fileCommandDetector
FileCommandDetector fileCommandDetector
-
hasEncodingOption
boolean hasEncodingOption
-
stringsPresent
boolean stringsPresent
-
-
-
Package org.apache.tika.parser.tmx
-
Class org.apache.tika.parser.tmx.TMXParser
class TMXParser extends Object implements Serializable- serialVersionUID:
- 2305588935434276452L
-
-
Package org.apache.tika.parser.transcribe.aws
-
Class org.apache.tika.parser.transcribe.aws.AmazonTranscribe
class AmazonTranscribe extends Object implements Serializable-
Serialized Fields
-
amazonS3
software.amazon.awssdk.services.s3.S3Client amazonS3
-
amazonTranscribeAsync
software.amazon.awssdk.services.transcribe.TranscribeAsyncClient amazonTranscribeAsync
-
credsProvider
software.amazon.awssdk.auth.credentials.StaticCredentialsProvider credsProvider
-
defaultConfig
AmazonTranscribeConfig defaultConfig
-
isAvailable
boolean isAvailable
-
-
-
Class org.apache.tika.parser.transcribe.aws.AmazonTranscribeConfig
class AmazonTranscribeConfig extends Object implements Serializable -
Class org.apache.tika.parser.transcribe.aws.AmazonTranscribeConfig.RuntimeConfig
class RuntimeConfig extends AmazonTranscribeConfig implements Serializable
-
-
Package org.apache.tika.parser.txt
-
Class org.apache.tika.parser.txt.Icu4jEncodingDetector
class Icu4jEncodingDetector extends Object implements Serializable-
Serialized Fields
-
defaultConfig
Icu4jEncodingDetector.Config defaultConfig
-
-
-
Class org.apache.tika.parser.txt.Icu4jEncodingDetector.Config
class Config extends Object implements Serializable -
Class org.apache.tika.parser.txt.TXTParser
class TXTParser extends AbstractEncodingDetectorParser implements Serializable- serialVersionUID:
- -6656102320836888910L
-
Class org.apache.tika.parser.txt.UniversalEncodingDetector
class UniversalEncodingDetector extends Object implements Serializable-
Serialized Fields
-
defaultConfig
UniversalEncodingDetector.Config defaultConfig
-
-
-
Class org.apache.tika.parser.txt.UniversalEncodingDetector.Config
class Config extends Object implements Serializable-
Serialized Fields
-
markLimit
int markLimit
-
-
-
-
Package org.apache.tika.parser.video
-
Class org.apache.tika.parser.video.FLVParser
class FLVParser extends Object implements Serializable- serialVersionUID:
- -8718013155719197679L
-
-
Package org.apache.tika.parser.vlm
-
Class org.apache.tika.parser.vlm.AbstractVLMParser
class AbstractVLMParser extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
defaultConfig
VLMOCRConfig defaultConfig
-
serverAvailable
boolean serverAvailable
-
-
Class org.apache.tika.parser.vlm.ClaudeVLMParser
class ClaudeVLMParser extends AbstractVLMParser implements Serializable- serialVersionUID:
- 1L
-
Class org.apache.tika.parser.vlm.GeminiVLMParser
class GeminiVLMParser extends AbstractVLMParser implements Serializable- serialVersionUID:
- 1L
-
Class org.apache.tika.parser.vlm.OpenAIVLMParser
class OpenAIVLMParser extends AbstractVLMParser implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
apiKeyHeaderName
String apiKeyHeaderName
HTTP header name used to send the API key. Default:
Authorization (standard OpenAI / Bearer auth). For Azure OpenAI, set to api-key. -
apiKeyPrefix
String apiKeyPrefix
Prefix prepended to the API key value in the auth header. Default:
"Bearer " (note the trailing space). For Azure OpenAI, set to "" (empty string) since Azure sends the raw key in the api-key header. -
completionsPath
String completionsPath
URL path appended to baseUrl for chat completions requests. Default:
/v1/chat/completions (standard OpenAI path). For Azure OpenAI, set to
/openai/deployments/{deployment}/chat/completions?api-version=2024-02-01.
-
-
Class org.apache.tika.parser.vlm.VLMOCRConfig
class VLMOCRConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
allowRuntimePrompt
boolean allowRuntimePrompt
If true, the prompt may be overridden at parse time via runtime configuration (ParseContext JSON config). Defaults to false so that the prompt is locked at initialization time. Enable this only if you trust the source of per-request configuration.
-
apiKey
String apiKey
Optional API key for authenticated endpoints. Empty means no auth. -
baseUrl
String baseUrl
Base URL of the OpenAI-compatible API (no trailing slash). -
completionsPath
String completionsPath
URL path appended to baseUrl for chat completions requests. Default is /v1/chat/completions (standard OpenAI path). Override for non-standard endpoints, e.g. Google Gemini: /chat/completions with baseUrl https://generativelanguage.googleapis.com/v1beta/openai. -
inlineContent
boolean inlineContent
If true, when this parser is used on an inline image (embedded resource type INLINE), the OCR text is written into the parent document's content stream as well as the embedded handler. This mirrors the behaviour of TesseractOCRParser's inline-content mode. -
maxFileSizeToOcr
long maxFileSizeToOcr
Maximum file size (bytes) to submit to VLM OCR. -
maxImagePixels
long maxImagePixels
Maximum total pixels (width × height) allowed for an image before it is skipped. Prevents sending enormous base64 payloads to the VLM endpoint. Default is 100,000,000 (100 megapixels). Set to
-1 for no limit (not recommended). Does not apply to PDF inputs. -
maxTokens
int maxTokens
Maximum number of tokens the model may generate. -
minFileSizeToOcr
long minFileSizeToOcr
Minimum file size (bytes) to submit to VLM OCR. -
model
String model
Model identifier sent in the chat completions request. -
prompt
String prompt
System prompt instructing the VLM how to OCR the image. Override this to request different output formats (markdown, plain text, etc.). -
skipOcr
boolean skipOcr
Whether to skip VLM OCR entirely (runtime kill-switch). -
timeoutSeconds
int timeoutSeconds
HTTP timeout in seconds for the chat completions request. VLM inference can be slow; default is generous.
-
-
Class org.apache.tika.parser.vlm.VLMOCRConfig.RuntimeConfig
class RuntimeConfig extends VLMOCRConfig implements Serializable-
Serialized Fields
-
initMaxTokens
int initMaxTokens
Init-time maxTokens ceiling — runtime requests cannot exceed this.
-
-
-
-
Package org.apache.tika.parser.wacz
-
Class org.apache.tika.parser.wacz.WACZParser
class WACZParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.warc
-
Class org.apache.tika.parser.warc.WARCParser
class WARCParser extends Object implements Serializable
-
-
Package org.apache.tika.parser.wordperfect
-
Class org.apache.tika.parser.wordperfect.QuattroProParser
class QuattroProParser extends Object implements Serializable- serialVersionUID:
- 8941810225917012232L
-
Class org.apache.tika.parser.wordperfect.WordPerfectParser
class WordPerfectParser extends Object implements Serializable- serialVersionUID:
- 8941810225917012232L
-
Serialized Fields
-
includeDeletedContent
boolean includeDeletedContent
-
-
-
Package org.apache.tika.parser.xliff
-
Class org.apache.tika.parser.xliff.XLIFF12Parser
class XLIFF12Parser extends Object implements Serializable- serialVersionUID:
- 1490085649251663857L
-
Class org.apache.tika.parser.xliff.XLZParser
class XLZParser extends Object implements Serializable- serialVersionUID:
- -1877314028666058564L
-
Serialized Fields
-
xliffParser
Parser xliffParser
Shared Parser instance.
-
-
-
Package org.apache.tika.parser.xml
-
Class org.apache.tika.parser.xml.DcXMLParser
class DcXMLParser extends XMLParser implements Serializable- serialVersionUID:
- 4905318835463880819L
-
Class org.apache.tika.parser.xml.FictionBookParser
class FictionBookParser extends XMLParser implements Serializable- serialVersionUID:
- 4195954546491524374L
-
Class org.apache.tika.parser.xml.TextAndAttributeXMLParser
class TextAndAttributeXMLParser extends XMLParser implements Serializable- serialVersionUID:
- 7796914007312429473L
-
Class org.apache.tika.parser.xml.XMLParser
class XMLParser extends Object implements Serializable- serialVersionUID:
- -6028836725280212837L
-
Class org.apache.tika.parser.xml.XMLProfiler
class XMLProfiler extends Object implements Serializable
-
-
Package org.apache.tika.pipes.api
-
Class org.apache.tika.pipes.api.FetchEmitTuple
class FetchEmitTuple extends Object implements Serializable-
Serialized Fields
-
emitKey
EmitKey emitKey
-
fetchKey
FetchKey fetchKey
-
id
String id
-
metadata
Metadata metadata
-
onParseException
FetchEmitTuple.ON_PARSE_EXCEPTION onParseException
-
parseContext
ParseContext parseContext
-
-
-
Record Class org.apache.tika.pipes.api.PipesResult
class PipesResult extends Record implements Serializable-
Serialized Fields
-
emitData
EmitData emitData
-
message
String message
-
status
PipesResult.RESULT_STATUS status
-
-
-
-
Package org.apache.tika.pipes.api.emitter
-
Class org.apache.tika.pipes.api.emitter.EmitKey
class EmitKey extends Object implements Serializable- serialVersionUID:
- -3861669115439125268L
-
Exception org.apache.tika.pipes.api.emitter.EmitterNotFoundException
class EmitterNotFoundException extends TikaException implements Serializable
-
-
Package org.apache.tika.pipes.api.fetcher
-
Exception org.apache.tika.pipes.api.fetcher.FetcherNotFoundException
class FetcherNotFoundException extends TikaException implements Serializable -
Class org.apache.tika.pipes.api.fetcher.FetchKey
class FetchKey extends Object implements Serializable- serialVersionUID:
- -3861669115439125268L
-
-
Package org.apache.tika.pipes.core
-
Class org.apache.tika.pipes.core.EmitStrategyConfig
class EmitStrategyConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
thresholdBytes
Long thresholdBytes
-
type
EmitStrategy type
-
-
Class org.apache.tika.pipes.core.PassbackFilter
class PassbackFilter extends Object implements Serializable -
Exception org.apache.tika.pipes.core.PipesException
class PipesException extends Exception implements Serializable -
Exception org.apache.tika.pipes.core.ServerInitializationException
class ServerInitializationException extends IOException implements Serializable
-
-
Package org.apache.tika.pipes.core.async
-
Exception org.apache.tika.pipes.core.async.OfferLargerThanQueueSize
class OfferLargerThanQueueSize extends IllegalArgumentException implements Serializable-
Serialized Fields
-
queueSize
int queueSize
-
sizeOffered
int sizeOffered
-
-
-
-
Package org.apache.tika.pipes.core.emitter
-
Exception org.apache.tika.pipes.core.emitter.TikaEmitterException
class TikaEmitterException extends IOException implements Serializable
-
-
Package org.apache.tika.pipes.core.extractor
-
Class org.apache.tika.pipes.core.extractor.UnpackConfig
class UnpackConfig extends Object implements Serializable- serialVersionUID:
- -3861669115439125268L
-
Serialized Fields
-
embeddedIdPrefix
String embeddedIdPrefix
-
emitKeyBase
String emitKeyBase
-
emitter
String emitter
-
includeFullMetadata
boolean includeFullMetadata
-
includeMetadataInZip
boolean includeMetadataInZip
-
includeOriginal
boolean includeOriginal
-
keyBaseStrategy
UnpackConfig.KEY_BASE_STRATEGY keyBaseStrategy
-
maxUnpackBytes
long maxUnpackBytes
-
outputFormat
UnpackConfig.OUTPUT_FORMAT outputFormat
-
outputMode
UnpackConfig.OUTPUT_MODE outputMode
-
suffixStrategy
UnpackConfig.SUFFIX_STRATEGY suffixStrategy
-
zeroPadName
int zeroPadName
-
zipEmbeddedFiles
boolean zipEmbeddedFiles
-
-
Class org.apache.tika.pipes.core.extractor.UnpackExtractorFactory
class UnpackExtractorFactory extends Object implements Serializable
-
-
Package org.apache.tika.pipes.core.fetcher
-
Exception org.apache.tika.pipes.core.fetcher.FetcherStringException
class FetcherStringException extends TikaException implements Serializable
-
-
Package org.apache.tika.pipes.core.protocol
-
Exception org.apache.tika.pipes.core.protocol.ProtocolDesyncException
class ProtocolDesyncException extends IOException implements Serializable -
Exception org.apache.tika.pipes.core.protocol.ShutDownReceivedException
class ShutDownReceivedException extends IOException implements Serializable
-
-
Package org.apache.tika.pipes.fork
-
Exception org.apache.tika.pipes.fork.PipesForkParserException
class PipesForkParserException extends TikaException implements Serializable-
Serialized Fields
-
status
PipesResult.RESULT_STATUS status
-
-
-
-
Package org.apache.tika.pipes.ignite
-
Class org.apache.tika.pipes.ignite.ExtensionConfigDTO
class ExtensionConfigDTO extends Object implements Serializable- serialVersionUID:
- 1L
-
-
Package org.apache.tika.pipes.reporter.fs
-
Record Class org.apache.tika.pipes.reporter.fs.FileSystemReporterConfig
class FileSystemReporterConfig extends Record implements Serializable-
Serialized Fields
-
reportUpdateMs
long reportUpdateMs
-
statusFile
Path statusFile
-
-
-
-
Package org.apache.tika.plugins
-
Record Class org.apache.tika.plugins.ExtensionConfig
class ExtensionConfig extends Record implements Serializable
-
-
Package org.apache.tika.renderer
-
Class org.apache.tika.renderer.CompositeRenderer
class CompositeRenderer extends Object implements Serializable
-
-
Package org.apache.tika.renderer.pdf.pdfbox
-
Class org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer
class PDFBoxRenderer extends Object implements Serializable
-
-
Package org.apache.tika.renderer.pdf.poppler
-
Class org.apache.tika.renderer.pdf.poppler.PopplerRenderer
class PopplerRenderer extends Object implements Serializable-
Serialized Fields
-
dpi
int dpi
-
gray
boolean gray
-
maxScaleTo
int maxScaleTo
Maximum pixel dimension (in pixels) for the longest edge of a rendered page image. Maps to pdftoppm's -scale-to flag.
If a PDF page would render larger than this value (in pixels) at the configured DPI, pdftoppm scales the output image down so that its longest edge equals maxScaleTo pixels, preserving the aspect ratio. For example, with maxScaleTo=4096, a landscape page that would normally render to 6000×4000 pixels is scaled to 4096×2731 pixels instead.
If the rendered image is already smaller than maxScaleTo on both edges, no scaling is applied — the image is not enlarged.
This is the primary defense against pathologically large PDF pages (e.g., architectural drawings, maps, posters) that would otherwise produce multi-gigabyte images and cause OOM.
Default is 4096 pixels. Set to -1 to disable scaling (not recommended). -
pdftoppmPath
String pdftoppmPath
-
timeoutMs
int timeoutMs
-
-
-
-
Package org.apache.tika.sax
-
Class org.apache.tika.sax.AbstractRecursiveParserWrapperHandler
class AbstractRecursiveParserWrapperHandler extends DefaultHandler implements Serializable-
Serialized Fields
-
contentHandlerFactory
ContentHandlerFactory contentHandlerFactory
-
embeddedDepth
int embeddedDepth
-
-
-
Class org.apache.tika.sax.BasicContentHandlerFactory
class BasicContentHandlerFactory extends Object implements Serializable-
Serialized Fields
-
throwOnWriteLimitReached
boolean throwOnWriteLimitReached
-
type
BasicContentHandlerFactory.HANDLER_TYPE type
-
writeLimit
int writeLimit
-
-
-
Class org.apache.tika.sax.RecursiveParserWrapperHandler
class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler implements Serializable -
Class org.apache.tika.sax.SAXOutputConfig
class SAXOutputConfig extends Object implements Serializable- serialVersionUID:
- 1L
-
Serialized Fields
-
includeTitle
boolean includeTitle
Whether to include the <title> element in the XHTML head. Default is true for backward compatibility. -
writeFileNameToContent
boolean writeFileNameToContent
Whether to write embedded file names to content (e.g., as <h1> elements). Default is true for backward compatibility. -
writeMetadataToHead
boolean writeMetadataToHead
Whether to write metadata as <meta> elements in the XHTML head. Default is true for backward compatibility.
-
-
Exception org.apache.tika.sax.StoppingEarlyException
class StoppingEarlyException extends SAXException implements Serializable -
Exception org.apache.tika.sax.TaggedSAXException
class TaggedSAXException extends SAXException implements Serializable-
Serialized Fields
-
tag
Object tag
The object reference used to tag the exception.
-
-
-
-
Package org.apache.tika.serialization
-
Class org.apache.tika.serialization.TikaModule
class TikaModule extends com.fasterxml.jackson.databind.module.SimpleModule implements Serializable -
Exception org.apache.tika.serialization.TikaSerializationException
class TikaSerializationException extends TikaException implements Serializable
-
-
Package org.apache.tika.server.client
-
Exception org.apache.tika.server.client.TikaClientConfigException
class TikaClientConfigException extends TikaException implements Serializable
-
-
Package org.apache.tika.server.core
-
Exception org.apache.tika.server.core.TikaServerParseException
class TikaServerParseException extends jakarta.ws.rs.WebApplicationException implements Serializable
-
-
Package org.apache.tika.utils
-
Class org.apache.tika.utils.XMLReaderUtils
class XMLReaderUtils extends Object implements Serializable- serialVersionUID:
- 6110455808615143122L
-
-
Package org.apache.tika.xmp
-
Class org.apache.tika.xmp.XMPMetadata
class XMPMetadata extends Metadata implements Serializable-
Serialization Methods
-
readObject
- Throws:
ClassNotFoundException
IOException
-
writeObject
- Throws:
IOException
-
-
Serialized Fields
-
xmpData
com.adobe.internal.xmp.XMPMeta xmpData
The XMP data
-
-
-