summaryrefslogtreecommitdiff
path: root/plugins/ingest-attachment
diff options
context:
space:
mode:
authorAlexander Reelsen <alexander@reelsen.net>2016-04-11 14:14:56 +0200
committerAlexander Reelsen <alexander@reelsen.net>2016-04-11 14:14:56 +0200
commitda19ddf3e6922a6659cd21c7dcf5deb1dc9bf366 (patch)
treecab47b9df3c82f4b72764178500eb2ca36dace18 /plugins/ingest-attachment
parent2713a08fb32d797800367bc3ec3cef78f4a5be89 (diff)
Ingest Attachment: Allow to prevent base64 conversions by using raw bytes (#16601)
CBOR is natively supported in Elasticsearch and allows for byte arrays. This means that, by using CBOR, the user can avoid base64 conversions for the data being sent back and forth. This PR adds support for extracting data from a byte array in addition to a string. This also required adding a ByteArrayValueSource class.
Diffstat (limited to 'plugins/ingest-attachment')
-rw-r--r--plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java7
-rw-r--r--plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java43
2 files changed, 33 insertions, 17 deletions
diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java
index f7edb46f4d..8b3b313356 100644
--- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java
+++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java
@@ -23,7 +23,6 @@ import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.elasticsearch.ElasticsearchParseException;
-import org.elasticsearch.common.Base64;
import org.elasticsearch.common.Strings;
import org.elasticsearch.ingest.core.AbstractProcessor;
import org.elasticsearch.ingest.core.AbstractProcessorFactory;
@@ -38,7 +37,6 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
-import static java.nio.charset.StandardCharsets.UTF_8;
import static org.elasticsearch.ingest.core.ConfigurationUtils.newConfigurationException;
import static org.elasticsearch.ingest.core.ConfigurationUtils.readIntProperty;
import static org.elasticsearch.ingest.core.ConfigurationUtils.readOptionalList;
@@ -66,13 +64,12 @@ public final class AttachmentProcessor extends AbstractProcessor {
@Override
public void execute(IngestDocument ingestDocument) {
- String base64Input = ingestDocument.getFieldValue(sourceField, String.class);
Map<String, Object> additionalFields = new HashMap<>();
try {
- byte[] decodedContent = Base64.decode(base64Input.getBytes(UTF_8));
Metadata metadata = new Metadata();
- String parsedContent = TikaImpl.parse(decodedContent, metadata, indexedChars);
+ byte[] input = ingestDocument.getFieldValueAsBytes(sourceField);
+ String parsedContent = TikaImpl.parse(input, metadata, indexedChars);
if (fields.contains(Field.CONTENT) && Strings.hasLength(parsedContent)) {
// somehow tika seems to append a newline at the end automatically, lets remove that again
diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
index 5470638ecc..021971c707 100644
--- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
@@ -57,8 +57,7 @@ public class AttachmentProcessorTests extends ESTestCase {
public void testEnglishTextDocument() throws Exception {
Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);
- assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type",
- "content_length"));
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
assertThat(attachmentData.get("language"), is("en"));
assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\""));
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
@@ -137,8 +136,8 @@ public class AttachmentProcessorTests extends ESTestCase {
public void testHtmlDocument() throws Exception {
Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
- assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "keywords", "title",
- "content_type", "content_length"));
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "keywords", "title", "content_type",
+ "content_length"));
assertThat(attachmentData.get("language"), is("en"));
assertThat(attachmentData.get("content"), is(notNullValue()));
assertThat(attachmentData.get("content_length"), is(notNullValue()));
@@ -151,16 +150,15 @@ public class AttachmentProcessorTests extends ESTestCase {
public void testXHtmlDocument() throws Exception {
Map<String, Object> attachmentData = parseDocument("testXHTML.html", processor);
- assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title",
- "content_type", "content_length"));
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length"));
assertThat(attachmentData.get("content_type").toString(), containsString("application/xhtml+xml"));
}
public void testEpubDocument() throws Exception {
Map<String, Object> attachmentData = parseDocument("testEPUB.epub", processor);
- assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title",
- "content_type", "content_length", "date", "keywords"));
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length",
+ "date", "keywords"));
assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip"));
}
@@ -168,11 +166,33 @@ public class AttachmentProcessorTests extends ESTestCase {
public void testAsciidocDocument() throws Exception {
Map<String, Object> attachmentData = parseDocument("asciidoc.asciidoc", processor);
- assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content_type", "content",
- "content_length"));
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content_type", "content", "content_length"));
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
}
+ public void testParseAsBytesArray() throws Exception {
+ String path = "/org/elasticsearch/ingest/attachment/test/sample-files/text-in-english.txt";
+ byte[] bytes;
+ try (InputStream is = AttachmentProcessorTests.class.getResourceAsStream(path)) {
+ bytes = IOUtils.toByteArray(is);
+ }
+
+ Map<String, Object> document = new HashMap<>();
+ document.put("source_field", bytes);
+
+ IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document);
+ processor.execute(ingestDocument);
+
+ @SuppressWarnings("unchecked")
+ Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("target_field");
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
+ assertThat(attachmentData.get("language"), is("en"));
+ assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\""));
+ assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
+ assertThat(attachmentData.get("content_length"), is(notNullValue()));
+ }
+
private Map<String, Object> parseDocument(String file, AttachmentProcessor processor) throws Exception {
Map<String, Object> document = new HashMap<>();
document.put("source_field", getAsBase64(file));
@@ -181,8 +201,7 @@ public class AttachmentProcessorTests extends ESTestCase {
processor.execute(ingestDocument);
@SuppressWarnings("unchecked")
- Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata()
- .get("target_field");
+ Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("target_field");
return attachmentData;
}