diff options
author | David Pilato <david@pilato.fr> | 2016-12-09 16:51:17 +0100 |
---|---|---|
committer | David Pilato <david@pilato.fr> | 2017-02-03 13:03:52 +0100 |
commit | 2b15d20f9351b2625825153dc87381eb037b20aa (patch) | |
tree | 4cc22a02d961bc237635abe93781f38a86cb9232 /plugins/ingest-attachment | |
parent | 8ace37e2146195d55eab2e5352114300b2a51ac3 (diff) |
Remove support for Visio and POTM files
We never actually supported Visio files, but we currently fail hard (killing the node) when that kind of file is provided.
See https://github.com/elastic/elasticsearch/pull/22079#issuecomment-277035357
This commit excludes Visio parsing from Tika so that it no longer fails but returns empty content instead.
As a side effect, it also removes support for POTM files.
Closes #22077.
Diffstat (limited to 'plugins/ingest-attachment')
7 files changed, 46 insertions, 1 deletions
diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle index 97b5a23f11..8a4f038b4c 100644 --- a/plugins/ingest-attachment/build.gradle +++ b/plugins/ingest-attachment/build.gradle @@ -74,9 +74,11 @@ dependencyLicenses { } forbiddenPatterns { + exclude '**/*.doc' exclude '**/*.docx' exclude '**/*.pdf' exclude '**/*.epub' + exclude '**/*.vsdx' } thirdPartyAudit.excludes = [ diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java index f99a2a630a..c7ffe4f287 100644 --- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java +++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java @@ -22,8 +22,10 @@ package org.elasticsearch.ingest.attachment; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; import org.elasticsearch.SpecialPermission; import org.elasticsearch.bootstrap.JarHell; import org.elasticsearch.common.SuppressForbidden; @@ -45,7 +47,9 @@ import java.security.PrivilegedActionException; import java.security.PrivilegedExceptionAction; import java.security.ProtectionDomain; import java.security.SecurityPermission; +import java.util.Collections; import java.util.PropertyPermission; +import java.util.Set; /** * Runs tika with limited parsers and limited permissions. 
@@ -54,6 +58,9 @@ import java.util.PropertyPermission; */ final class TikaImpl { + /** Exclude some formats */ + private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml")); + /** subset of parsers for types we support */ private static final Parser PARSERS[] = new Parser[] { // documents @@ -63,7 +70,7 @@ final class TikaImpl { new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), - new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), + ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES), new org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java index b59457b5b0..e5b9d72017 100644 --- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java +++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java @@ -47,6 +47,7 @@ import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.notNullValue; +import static org.hamcrest.Matchers.nullValue; import static org.hamcrest.core.IsCollectionContaining.hasItem; public class AttachmentProcessorTests extends ESTestCase { @@ -130,6 +131,34 @@ public class AttachmentProcessorTests extends ESTestCase { is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); } + public void testWordDocumentWithVisioSchema() throws Exception { + Map<String, Object> attachmentData = parseDocument("issue-22077.docx", 
processor); + + assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", + "content_length")); + assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); + assertThat(attachmentData.get("language"), is("en")); + assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z")); + assertThat(attachmentData.get("author"), is(notNullValue())); + assertThat(attachmentData.get("content_length"), is(notNullValue())); + assertThat(attachmentData.get("content_type").toString(), + is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + } + + public void testLegacyWordDocumentWithVisioSchema() throws Exception { + Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor); + + assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", + "content_length")); + assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); + assertThat(attachmentData.get("language"), is("en")); + assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z")); + assertThat(attachmentData.get("author"), is(notNullValue())); + assertThat(attachmentData.get("content_length"), is(notNullValue())); + assertThat(attachmentData.get("content_type").toString(), + is("application/msword")); + } + public void testPdf() throws Exception { Map<String, Object> attachmentData = parseDocument("test.pdf", processor); assertThat(attachmentData.get("content"), @@ -138,6 +167,13 @@ public class AttachmentProcessorTests extends ESTestCase { assertThat(attachmentData.get("content_length"), is(notNullValue())); } + public void testVisioIsExcluded() throws Exception { + Map<String, Object> attachmentData = parseDocument("issue-22077.vsdx", processor); + assertThat(attachmentData.get("content"), nullValue()); + assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing")); + 
assertThat(attachmentData.get("content_length"), is(0L)); + } + public void testEncryptedPdf() throws Exception { ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor)); assertThat(e.getDetailedMessage(), containsString("document is encrypted")); diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc Binary files differnew file mode 100644 index 0000000000..10badd5809 --- /dev/null +++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx Binary files differnew file mode 100644 index 0000000000..bab550607a --- /dev/null +++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx Binary files differnew file mode 100644 index 0000000000..fb9cde51b4 --- /dev/null +++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip Binary files differindex cfc2e54b79..67d1316cb4 100644 --- 
a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip +++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip |