diff options
author | Alexander Reelsen <alexander@reelsen.net> | 2016-04-11 14:14:56 +0200 |
---|---|---|
committer | Alexander Reelsen <alexander@reelsen.net> | 2016-04-11 14:14:56 +0200 |
commit | da19ddf3e6922a6659cd21c7dcf5deb1dc9bf366 (patch) | |
tree | cab47b9df3c82f4b72764178500eb2ca36dace18 /plugins/ingest-attachment | |
parent | 2713a08fb32d797800367bc3ec3cef78f4a5be89 (diff) |
Ingest Attachment: Allow to prevent base64 conversions by using raw bytes (#16601)
CBOR is natively supported in Elasticsearch and allows for byte arrays.
This means that, by using CBOR, the user can prevent base64 conversions
for the data being sent back and forth.
This PR adds support to extract data from a byte array in addition to
a string. This also required adding a ByteArrayValueSource class.
Diffstat (limited to 'plugins/ingest-attachment')
2 files changed, 33 insertions, 17 deletions
diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java index f7edb46f4d..8b3b313356 100644 --- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java +++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java @@ -23,7 +23,6 @@ import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.elasticsearch.ElasticsearchParseException; -import org.elasticsearch.common.Base64; import org.elasticsearch.common.Strings; import org.elasticsearch.ingest.core.AbstractProcessor; import org.elasticsearch.ingest.core.AbstractProcessorFactory; @@ -38,7 +37,6 @@ import java.util.Locale; import java.util.Map; import java.util.Set; -import static java.nio.charset.StandardCharsets.UTF_8; import static org.elasticsearch.ingest.core.ConfigurationUtils.newConfigurationException; import static org.elasticsearch.ingest.core.ConfigurationUtils.readIntProperty; import static org.elasticsearch.ingest.core.ConfigurationUtils.readOptionalList; @@ -66,13 +64,12 @@ public final class AttachmentProcessor extends AbstractProcessor { @Override public void execute(IngestDocument ingestDocument) { - String base64Input = ingestDocument.getFieldValue(sourceField, String.class); Map<String, Object> additionalFields = new HashMap<>(); try { - byte[] decodedContent = Base64.decode(base64Input.getBytes(UTF_8)); Metadata metadata = new Metadata(); - String parsedContent = TikaImpl.parse(decodedContent, metadata, indexedChars); + byte[] input = ingestDocument.getFieldValueAsBytes(sourceField); + String parsedContent = TikaImpl.parse(input, metadata, indexedChars); if (fields.contains(Field.CONTENT) && Strings.hasLength(parsedContent)) { // somehow tika seems to 
append a newline at the end automatically, lets remove that again diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java index 5470638ecc..021971c707 100644 --- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java +++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java @@ -57,8 +57,7 @@ public class AttachmentProcessorTests extends ESTestCase { public void testEnglishTextDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", - "content_length")); + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\"")); assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); @@ -137,8 +136,8 @@ public class AttachmentProcessorTests extends ESTestCase { public void testHtmlDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "keywords", "title", - "content_type", "content_length")); + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "keywords", "title", "content_type", + "content_length")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("content"), is(notNullValue())); assertThat(attachmentData.get("content_length"), is(notNullValue())); @@ -151,16 +150,15 @@ public 
class AttachmentProcessorTests extends ESTestCase { public void testXHtmlDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("testXHTML.html", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", - "content_type", "content_length")); + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length")); assertThat(attachmentData.get("content_type").toString(), containsString("application/xhtml+xml")); } public void testEpubDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("testEPUB.epub", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", - "content_type", "content_length", "date", "keywords")); + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", + "date", "keywords")); assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip")); } @@ -168,11 +166,33 @@ public class AttachmentProcessorTests extends ESTestCase { public void testAsciidocDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("asciidoc.asciidoc", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content_type", "content", - "content_length")); + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content_type", "content", "content_length")); assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); } + public void testParseAsBytesArray() throws Exception { + String path = "/org/elasticsearch/ingest/attachment/test/sample-files/text-in-english.txt"; + byte[] bytes; + try (InputStream is = AttachmentProcessorTests.class.getResourceAsStream(path)) { + bytes = IOUtils.toByteArray(is); + } + + Map<String, Object> document = new HashMap<>(); + 
document.put("source_field", bytes); + + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("target_field"); + + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length")); + assertThat(attachmentData.get("language"), is("en")); + assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\"")); + assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); + assertThat(attachmentData.get("content_length"), is(notNullValue())); + } + private Map<String, Object> parseDocument(String file, AttachmentProcessor processor) throws Exception { Map<String, Object> document = new HashMap<>(); document.put("source_field", getAsBase64(file)); @@ -181,8 +201,7 @@ public class AttachmentProcessorTests extends ESTestCase { processor.execute(ingestDocument); @SuppressWarnings("unchecked") - Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata() - .get("target_field"); + Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("target_field"); return attachmentData; } |