public abstract class AbstractProfiler extends FileResourceConsumer
Modifier and Type | Class and Description |
---|---|
static class |
AbstractProfiler.EXCEPTION_TYPE |
static class |
AbstractProfiler.PARSE_ERROR_TYPE
If information was gathered from the log file about
a parse error
|
Modifier and Type | Field and Description |
---|---|
static String |
FALSE |
protected static AtomicInteger |
ID |
static TableInfo |
MIME_TABLE |
static TableInfo |
REF_EXTRACT_EXCEPTION_TYPES |
static TableInfo |
REF_PARSE_ERROR_TYPES |
static TableInfo |
REF_PARSE_EXCEPTION_TYPES |
static String |
TRUE |
protected IDBWriter |
writer |
ELAPSED_MILLIS, IO_IS, IO_OS, OOM, PARSE_ERR, PARSE_EX, TIMED_OUT
Constructor and Description |
---|
AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
IDBWriter writer) |
Modifier and Type | Method and Description |
---|---|
protected Map<Class,Object> |
calcTextStats(ContentTags contentTags) |
void |
closeWriter() |
protected static ContentTags |
getContent(org.apache.tika.eval.app.EvalFilePaths evalFilePaths,
Metadata metadata) |
protected long |
getFileLength(Path p) |
protected org.apache.tika.eval.app.EvalFilePaths |
getPathsFromExtractCrawl(Metadata metadata,
Path extracts) |
protected org.apache.tika.eval.app.EvalFilePaths |
getPathsFromSrcCrawl(Metadata metadata,
Path srcDir,
Path extracts) |
protected long |
getSourceFileLength(org.apache.tika.eval.app.EvalFilePaths fps,
List<Metadata> metadataList) |
static void |
loadCommonTokens(Path p,
String defaultLangCode) |
void |
setMaxContentLength(int maxContentLength)
Truncate the content string to this length if it is longer than this length
|
void |
setMaxContentLengthForLangId(int maxContentLengthForLangId)
Truncate the content string to this length for language identification if it is longer than this length
|
void |
setMaxTokens(int maxTokens)
Add a LimitTokenCountFilterFactory if maxTokens > -1
|
protected static String |
truncateContent(ContentTags contentTags,
int maxLength,
Map<Cols,String> data)
Get the content and record in the data
Cols.CONTENT_TRUNCATED_AT_MAX_LEN whether the string was truncated |
protected void |
writeContentData(String fileId,
Map<Class,Object> textStats,
TableInfo contentsTable)
Checks to see if metadata is null or content is empty (null or only whitespace).
|
protected void |
writeExceptionData(String fileId,
Metadata m,
TableInfo exceptionTable) |
protected void |
writeExtractException(TableInfo extractExceptionTable,
String containerId,
String filePath,
ExtractReaderException.TYPE type) |
protected void |
writeProfileData(org.apache.tika.eval.app.EvalFilePaths fps,
int i,
ContentTags contentTags,
Metadata m,
String fileId,
String containerId,
List<Integer> numAttachments,
TableInfo profileTable) |
call, checkForTimedOutMillis, close, flushAndClose, getCurrentFile, getNumHandledExceptions, getNumResourcesConsumed, getXMLifiedLogMsg, getXMLifiedLogMsg, incrementHandledExceptions, isStillActive, parse, pleaseShutdown, processFileResource
public static final String TRUE
public static final String FALSE
protected static final AtomicInteger ID
public static TableInfo REF_EXTRACT_EXCEPTION_TYPES
public static TableInfo REF_PARSE_ERROR_TYPES
public static TableInfo REF_PARSE_EXCEPTION_TYPES
public static TableInfo MIME_TABLE
protected IDBWriter writer
public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue, IDBWriter writer)
public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException
p
- path to the common_tokens directory. If this is null, try to load from the classpath
defaultLangCode
- this is the language code to use if a common_words list doesn't exist for the
detected language; can be null
IOException
protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols,String> data)
Get the content and record in the data
Cols.CONTENT_TRUNCATED_AT_MAX_LEN
whether the string was truncated
contentTags
-
maxLength
-
data
-
protected static ContentTags getContent(org.apache.tika.eval.app.EvalFilePaths evalFilePaths, Metadata metadata)
public void setMaxContentLength(int maxContentLength)
maxContentLength
-
public void setMaxContentLengthForLangId(int maxContentLengthForLangId)
maxContentLengthForLangId
-
public void setMaxTokens(int maxTokens)
maxTokens
-
protected void writeExtractException(TableInfo extractExceptionTable, String containerId, String filePath, ExtractReaderException.TYPE type) throws IOException
IOException
protected void writeProfileData(org.apache.tika.eval.app.EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, String fileId, String containerId, List<Integer> numAttachments, TableInfo profileTable)
protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable)
protected Map<Class,Object> calcTextStats(ContentTags contentTags)
protected void writeContentData(String fileId, Map<Class,Object> textStats, TableInfo contentsTable) throws IOException
fileId
-
textStats
-
contentsTable
-
IOException
public void closeWriter() throws IOException
IOException
protected org.apache.tika.eval.app.EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, Path extracts)
metadata
-
extracts
-
protected org.apache.tika.eval.app.EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, Path extracts)
protected long getSourceFileLength(org.apache.tika.eval.app.EvalFilePaths fps, List<Metadata> metadataList)
protected long getFileLength(Path p)
Copyright © 2007–2022 The Apache Software Foundation. All rights reserved.