From 14830751b4629b08c9ecf5307d33914722108858 Mon Sep 17 00:00:00 2001 From: mvanzalu Date: Mon, 4 Sep 2023 10:35:51 +0000 Subject: [PATCH] fix: use title and subject for email/tweet #1173 --- .../elasticsearch/ElasticsearchSpewer.java | 29 +++++-- .../ElasticsearchSpewerTest.java | 85 ++++++++++++++++--- 2 files changed, 97 insertions(+), 17 deletions(-) diff --git a/datashare-index/src/main/java/org/icij/datashare/text/indexing/elasticsearch/ElasticsearchSpewer.java b/datashare-index/src/main/java/org/icij/datashare/text/indexing/elasticsearch/ElasticsearchSpewer.java index 876241703..65607daa7 100644 --- a/datashare-index/src/main/java/org/icij/datashare/text/indexing/elasticsearch/ElasticsearchSpewer.java +++ b/datashare-index/src/main/java/org/icij/datashare/text/indexing/elasticsearch/ElasticsearchSpewer.java @@ -160,14 +160,29 @@ Map getDuplicateMap(TikaDocument document) { return jsonDocument; } - String getTitle(Metadata metadata) { - if (metadata.get(DublinCore.SUBJECT) != null && !metadata.get(DublinCore.SUBJECT).isEmpty()) { - return metadata.get(DublinCore.SUBJECT); - } else if (metadata.get(DublinCore.TITLE) != null && !metadata.get(DublinCore.TITLE).isEmpty()) { - return metadata.get(DublinCore.TITLE); - } else { - return metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + protected boolean isEmail(Metadata metadata) { + String contentType = ofNullable(metadata.get(CONTENT_TYPE)).orElse(DEFAULT_VALUE_UNKNOWN); + return contentType.startsWith("message/") || contentType.equals("application/vnd.ms-outlook"); + } + + protected boolean isTweet(Metadata metadata) { + return ofNullable(metadata.get(CONTENT_TYPE)).orElse(DEFAULT_VALUE_UNKNOWN).equals("application/json; twint"); + } + + protected String getTitle(Metadata metadata) { + if (isEmail(metadata)) { + if (metadata.get(DublinCore.SUBJECT) != null && !metadata.get(DublinCore.SUBJECT).isEmpty()) { + return metadata.get(DublinCore.SUBJECT); + } else if (metadata.get(DublinCore.TITLE) != null && !metadata.get(DublinCore.TITLE).isEmpty()) { + return metadata.get(DublinCore.TITLE); + } + } + if (isTweet(metadata)) { + if (metadata.get(DublinCore.TITLE) != null && !metadata.get(DublinCore.TITLE).isEmpty()) { + return metadata.get(DublinCore.TITLE); + } } + return metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); } public ElasticsearchSpewer withRefresh(WriteRequest.RefreshPolicy refreshPolicy) { diff --git a/datashare-index/src/test/java/org/icij/datashare/text/indexing/elasticsearch/ElasticsearchSpewerTest.java b/datashare-index/src/test/java/org/icij/datashare/text/indexing/elasticsearch/ElasticsearchSpewerTest.java index 5b10759a0..91f660a39 100644 --- a/datashare-index/src/test/java/org/icij/datashare/text/indexing/elasticsearch/ElasticsearchSpewerTest.java +++ b/datashare-index/src/test/java/org/icij/datashare/text/indexing/elasticsearch/ElasticsearchSpewerTest.java @@ -1,14 +1,21 @@ package org.icij.datashare.text.indexing.elasticsearch; import org.apache.tika.metadata.DublinCore; +import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParsingReader; +import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest; import org.elasticsearch.action.get.GetRequest; import org.elasticsearch.action.get.GetResponse; +import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.client.RequestOptions; import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.index.query.QueryStringQueryBuilder; +import org.elasticsearch.index.query.SimpleQueryStringBuilder; +import org.elasticsearch.index.reindex.DeleteByQueryRequest; +import org.elasticsearch.rest.RestRequest; import org.elasticsearch.search.builder.SearchSourceBuilder; import org.icij.datashare.HumanReadableSize; import org.icij.datashare.PropertiesProvider; @@ -28,6 +35,7 @@ import org.icij.extract.extractor.UpdatableDigester; import org.icij.spewer.FieldNames; import org.icij.task.Options; +import org.junit.After; import org.junit.ClassRule; import org.junit.Test; import org.mockito.ArgumentCaptor; @@ -42,6 +50,7 @@ import java.util.Objects; import static java.nio.file.Paths.get; +import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE; import static org.elasticsearch.action.support.WriteRequest.RefreshPolicy.IMMEDIATE; import static org.fest.assertions.Assertions.assertThat; import static org.fest.assertions.MapAssert.entry; @@ -201,33 +210,78 @@ public void test_title_and_title_norm() throws Exception { assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "t-file.txt")); } + @Test + public void test_title_and_title_norm_for_an_tweet() throws Exception { + final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Tweet-File.json")); + final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes())); + document.setReader(reader); + document.getMetadata().set(CONTENT_TYPE, "application/json; twint"); + + spewer.write(document); + + GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT); + assertThat(documentFields.getSourceAsMap()).includes(entry("title", "Tweet-File.json")); + assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "tweet-file.json")); + } + + @Test + public void test_title_and_title_norm_for_an_tweet_with_dc_title() throws Exception { + final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Tweet-File.json")); + final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes())); + document.setReader(reader); + document.getMetadata().set(CONTENT_TYPE, "application/json; twint"); + document.getMetadata().set(DublinCore.TITLE, "This is a tweet."); + + spewer.write(document); + + GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT); + assertThat(documentFields.getSourceAsMap()).includes(entry("title", "This is a tweet.")); + assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "this is a tweet.")); + } + @Test public void test_title_and_title_norm_for_an_email() throws Exception { - final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("E-File.txt")); + final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Email-File.txt")); final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes())); document.setReader(reader); - document.getMetadata().set(DublinCore.TITLE, "Email Title"); + document.getMetadata().set(CONTENT_TYPE, "message/http"); spewer.write(document); GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT); - assertThat(documentFields.getSourceAsMap()).includes(entry("title", "Email Title")); - assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "email title")); + assertThat(documentFields.getSourceAsMap()).includes(entry("title", "Email-File.txt")); + assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "email-file.txt")); } @Test - public void test_title_and_title_norm_for_a_tweet() throws Exception { - final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Tweet-File.txt")); + public void test_title_and_title_norm_for_an_email_with_dc_title() throws Exception { + final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Email-File.txt")); final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes())); document.setReader(reader); - document.getMetadata().set(DublinCore.TITLE, "Email Title"); - document.getMetadata().set(DublinCore.SUBJECT, "Tweet Title"); + document.getMetadata().set(CONTENT_TYPE, "message/http"); + document.getMetadata().set(DublinCore.TITLE, "This is an email."); spewer.write(document); GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT); - assertThat(documentFields.getSourceAsMap()).includes(entry("title", "Tweet Title")); - assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "tweet title")); + assertThat(documentFields.getSourceAsMap()).includes(entry("title", "This is an email.")); + assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "this is an email.")); + } + + @Test + public void test_title_and_title_norm_for_an_email_with_dc_subject() throws Exception { + final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Email-File.txt")); + final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes())); + document.setReader(reader); + document.getMetadata().set(CONTENT_TYPE, "message/http"); + document.getMetadata().set(DublinCore.TITLE, "This is an email."); + document.getMetadata().set(DublinCore.SUBJECT, "This is a more detailed email."); + + spewer.write(document); + + GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT); + assertThat(documentFields.getSourceAsMap()).includes(entry("title", "This is a more detailed email.")); + assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "this is a more detailed email.")); } @Test @@ -334,6 +388,17 @@ public void test_get_max_content_length_is_limited_to_2G() { .isEqualTo(HumanReadableSize.parse("2G")-1); // Integer.MAX_VALUE } + @After + public void after() { + try { + DeleteByQueryRequest deleteByQueryRequest = new DeleteByQueryRequest("test-datashare"); + deleteByQueryRequest.setQuery(QueryBuilders.matchAllQuery()); + es.client.deleteByQuery(deleteByQueryRequest, RequestOptions.DEFAULT).getDeleted(); + } catch (IOException e) { + e.printStackTrace(); + } + } + private Map convert(Metadata metadata) { Map map = new HashMap<>(); for (String name: metadata.names()) {