Package org.apache.tika.parser.microsoft
Class WordExtractor
- java.lang.Object
-
- org.apache.tika.parser.microsoft.WordExtractor
-
public class WordExtractor extends Object
-
-
Nested Class Summary
Nested Classes
Modifier and Type | Class | Description
static class
WordExtractor.TagAndStyle
-
Field Summary
Fields
Modifier and Type | Field | Description
protected ParseContext
context
protected OfficeParserConfig
officeParserConfig
protected Metadata
parentMetadata
-
Constructor Summary
Constructors
Constructor | Description
WordExtractor(ParseContext context, Metadata metadata)
-
Method Summary
All Methods | Static Methods | Instance Methods | Concrete Methods | Deprecated Methods
Modifier and Type | Method | Description
static WordExtractor.TagAndStyle
buildParagraphTagAndStyle(String styleName, boolean isTable)
Given a style name, return what tag should be used, and what style should be applied to it.
protected Detector
getDetector()
protected MimeTypes
getMimeTypes()
Deprecated.
protected String
getPassword()
Returns the password to be used for this file, or null if no / default password should be used.
protected TikaConfig
getTikaConfig()
protected void
handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml)
Handle an office document that's embedded at the POIFS level.
protected void
handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, XHTMLContentHandler xhtml)
Handle an office document that's embedded at the POIFS level.
protected void
handleEmbeddedResource(TikaInputStream resource, String filename, String relationshipID, String mediaType, XHTMLContentHandler xhtml, boolean outputHtml)
protected void
handleEmbeddedResource(TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, XHTMLContentHandler xhtml, boolean outputHtml)
protected void
handleEmbeddedResource(TikaInputStream resource, Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, XHTMLContentHandler xhtml, boolean outputHtml)
protected void
parse(org.apache.poi.poifs.filesystem.DirectoryNode root, XHTMLContentHandler xhtml)
protected void
parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
protected void
parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, XHTMLContentHandler xhtml)
protected void
parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
-
-
-
Field Detail
-
parentMetadata
protected final Metadata parentMetadata
-
officeParserConfig
protected final OfficeParserConfig officeParserConfig
-
context
protected final ParseContext context
-
-
Constructor Detail
-
WordExtractor
public WordExtractor(ParseContext context, Metadata metadata)
-
-
Method Detail
-
buildParagraphTagAndStyle
public static WordExtractor.TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable)
Given a style name, return what tag should be used, and what style should be applied to it.
-
parse
protected void parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException
- Throws:
IOException
SAXException
TikaException
-
parse
protected void parse(org.apache.poi.poifs.filesystem.DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException
- Throws:
IOException
SAXException
TikaException
-
parseWord6
protected void parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException
- Throws:
IOException
SAXException
TikaException
-
parseWord6
protected void parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException
- Throws:
IOException
SAXException
TikaException
-
getTikaConfig
protected TikaConfig getTikaConfig()
-
getDetector
protected Detector getDetector()
-
getMimeTypes
protected MimeTypes getMimeTypes()
Deprecated.
- Returns:
- mimetypes
-
getPassword
protected String getPassword()
Returns the password to be used for this file, or null if no password (or the default password) should be used.
-
handleEmbeddedResource
protected void handleEmbeddedResource(TikaInputStream resource, String filename, String relationshipID, String mediaType, XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, TikaException
- Throws:
IOException
SAXException
TikaException
-
handleEmbeddedResource
protected void handleEmbeddedResource(TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, TikaException
- Throws:
IOException
SAXException
TikaException
-
handleEmbeddedResource
protected void handleEmbeddedResource(TikaInputStream resource, Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, TikaException
- Throws:
IOException
SAXException
TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException
Handle an office document that's embedded at the POIFS level.
- Throws:
IOException
SAXException
TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException
Handle an office document that's embedded at the POIFS level.
- Throws:
IOException
SAXException
TikaException
-
-