summaryrefslogtreecommitdiff
path: root/plugins/ingest-attachment
diff options
context:
space:
mode:
author David Pilato <david@pilato.fr> 2016-12-09 16:51:17 +0100
committer David Pilato <david@pilato.fr> 2017-02-03 13:03:52 +0100
commit 2b15d20f9351b2625825153dc87381eb037b20aa (patch)
tree 4cc22a02d961bc237635abe93781f38a86cb9232 /plugins/ingest-attachment
parent 8ace37e2146195d55eab2e5352114300b2a51ac3 (diff)
Remove support for Visio and POTM files
Actually, we never supported Visio files, but we were failing hard (killing a node) when that kind of file was provided. See https://github.com/elastic/elasticsearch/pull/22079#issuecomment-277035357. This commit excludes Visio parsing from Tika so that it no longer fails but returns empty content instead. As a side effect, it also removes support for POTM files. Closes #22077.
Diffstat (limited to 'plugins/ingest-attachment')
-rw-r--r--plugins/ingest-attachment/build.gradle2
-rw-r--r--plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java9
-rw-r--r--plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java36
-rw-r--r--plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docbin0 -> 99840 bytes
-rw-r--r--plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docxbin0 -> 82740 bytes
-rw-r--r--plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdxbin0 -> 210451 bytes
-rw-r--r--plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zipbin6363020 -> 6331462 bytes
7 files changed, 46 insertions, 1 deletions
diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle
index 97b5a23f11..8a4f038b4c 100644
--- a/plugins/ingest-attachment/build.gradle
+++ b/plugins/ingest-attachment/build.gradle
@@ -74,9 +74,11 @@ dependencyLicenses {
}
forbiddenPatterns {
+ exclude '**/*.doc'
exclude '**/*.docx'
exclude '**/*.pdf'
exclude '**/*.epub'
+ exclude '**/*.vsdx'
}
thirdPartyAudit.excludes = [
diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
index f99a2a630a..c7ffe4f287 100644
--- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
+++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
@@ -22,8 +22,10 @@ package org.elasticsearch.ingest.attachment;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.bootstrap.JarHell;
import org.elasticsearch.common.SuppressForbidden;
@@ -45,7 +47,9 @@ import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.security.ProtectionDomain;
import java.security.SecurityPermission;
+import java.util.Collections;
import java.util.PropertyPermission;
+import java.util.Set;
/**
* Runs tika with limited parsers and limited permissions.
@@ -54,6 +58,9 @@ import java.util.PropertyPermission;
*/
final class TikaImpl {
+ /** Exclude some formats */
+ private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));
+
/** subset of parsers for types we support */
private static final Parser PARSERS[] = new Parser[] {
// documents
@@ -63,7 +70,7 @@ final class TikaImpl {
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
- new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
+ ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
index b59457b5b0..e5b9d72017 100644
--- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
@@ -47,6 +47,7 @@ import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.notNullValue;
+import static org.hamcrest.Matchers.nullValue;
import static org.hamcrest.core.IsCollectionContaining.hasItem;
public class AttachmentProcessorTests extends ESTestCase {
@@ -130,6 +131,34 @@ public class AttachmentProcessorTests extends ESTestCase {
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
}
+ public void testWordDocumentWithVisioSchema() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
+ "content_length"));
+ assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
+ assertThat(attachmentData.get("language"), is("en"));
+ assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
+ assertThat(attachmentData.get("author"), is(notNullValue()));
+ assertThat(attachmentData.get("content_length"), is(notNullValue()));
+ assertThat(attachmentData.get("content_type").toString(),
+ is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+ }
+
+ public void testLegacyWordDocumentWithVisioSchema() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
+ "content_length"));
+ assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
+ assertThat(attachmentData.get("language"), is("en"));
+ assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
+ assertThat(attachmentData.get("author"), is(notNullValue()));
+ assertThat(attachmentData.get("content_length"), is(notNullValue()));
+ assertThat(attachmentData.get("content_type").toString(),
+ is("application/msword"));
+ }
+
public void testPdf() throws Exception {
Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
assertThat(attachmentData.get("content"),
@@ -138,6 +167,13 @@ public class AttachmentProcessorTests extends ESTestCase {
assertThat(attachmentData.get("content_length"), is(notNullValue()));
}
+ public void testVisioIsExcluded() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("issue-22077.vsdx", processor);
+ assertThat(attachmentData.get("content"), nullValue());
+ assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing"));
+ assertThat(attachmentData.get("content_length"), is(0L));
+ }
+
public void testEncryptedPdf() throws Exception {
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
assertThat(e.getDetailedMessage(), containsString("document is encrypted"));
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc
new file mode 100644
index 0000000000..10badd5809
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc
Binary files differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx
new file mode 100644
index 0000000000..bab550607a
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx
Binary files differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx
new file mode 100644
index 0000000000..fb9cde51b4
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx
Binary files differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip
index cfc2e54b79..67d1316cb4 100644
--- a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip
Binary files differ