Skip to content

Commit

Permalink
fix: use title and subject for email/tweet #1173
Browse files Browse the repository at this point in the history
  • Loading branch information
mvanzalu committed Sep 4, 2023
1 parent 571bccf commit 1483075
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -160,14 +160,29 @@ Map<String, Object> getDuplicateMap(TikaDocument document) {
return jsonDocument;
}

String getTitle(Metadata metadata) {
if (metadata.get(DublinCore.SUBJECT) != null && !metadata.get(DublinCore.SUBJECT).isEmpty()) {
return metadata.get(DublinCore.SUBJECT);
} else if (metadata.get(DublinCore.TITLE) != null && !metadata.get(DublinCore.TITLE).isEmpty()) {
return metadata.get(DublinCore.TITLE);
} else {
return metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
/**
 * Tells whether the metadata describes an email.
 *
 * A missing content type is replaced by {@code DEFAULT_VALUE_UNKNOWN} before matching.
 *
 * @param metadata metadata whose {@code CONTENT_TYPE} entry is inspected
 * @return {@code true} when the content type starts with {@code message/} or is exactly
 *         {@code application/vnd.ms-outlook}
 */
protected boolean isEmail(Metadata metadata) {
    final String type = ofNullable(metadata.get(CONTENT_TYPE)).orElse(DEFAULT_VALUE_UNKNOWN);
    if (type.startsWith("message/")) {
        return true;
    }
    return type.equals("application/vnd.ms-outlook");
}

/**
 * Tells whether the metadata describes a tweet.
 *
 * A missing content type is replaced by {@code DEFAULT_VALUE_UNKNOWN} before matching.
 *
 * @param metadata metadata whose {@code CONTENT_TYPE} entry is inspected
 * @return {@code true} when the content type is exactly {@code application/json; twint}
 */
protected boolean isTweet(Metadata metadata) {
    final String type = ofNullable(metadata.get(CONTENT_TYPE)).orElse(DEFAULT_VALUE_UNKNOWN);
    return type.equals("application/json; twint");
}

/**
 * Picks the title to index for a document.
 *
 * <ul>
 *   <li>Email: the Dublin Core subject when non-empty, otherwise the Dublin Core title
 *       when non-empty.</li>
 *   <li>Tweet: the Dublin Core title when non-empty.</li>
 *   <li>Anything else, or when the fields above are missing or empty: the resource name.</li>
 * </ul>
 *
 * @param metadata metadata of the document being indexed
 * @return the chosen title; this is whatever the metadata holds under
 *         {@code RESOURCE_NAME_KEY} when no other rule applies
 */
protected String getTitle(Metadata metadata) {
    if (isEmail(metadata)) {
        final String subject = metadata.get(DublinCore.SUBJECT);
        if (subject != null && !subject.isEmpty()) {
            return subject;
        }
        final String emailTitle = metadata.get(DublinCore.TITLE);
        if (emailTitle != null && !emailTitle.isEmpty()) {
            return emailTitle;
        }
    }
    if (isTweet(metadata)) {
        final String tweetTitle = metadata.get(DublinCore.TITLE);
        if (tweetTitle != null && !tweetTitle.isEmpty()) {
            return tweetTitle;
        }
    }
    // Fallback shared by every document type.
    return metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
}

public ElasticsearchSpewer withRefresh(WriteRequest.RefreshPolicy refreshPolicy) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
package org.icij.datashare.text.indexing.elasticsearch;

import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParsingReader;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.action.get.GetRequest;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
import org.elasticsearch.index.reindex.DeleteByQueryRequest;
import org.elasticsearch.rest.RestRequest;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.icij.datashare.HumanReadableSize;
import org.icij.datashare.PropertiesProvider;
Expand All @@ -28,6 +35,7 @@
import org.icij.extract.extractor.UpdatableDigester;
import org.icij.spewer.FieldNames;
import org.icij.task.Options;
import org.junit.After;
import org.junit.ClassRule;
import org.junit.Test;
import org.mockito.ArgumentCaptor;
Expand All @@ -42,6 +50,7 @@
import java.util.Objects;

import static java.nio.file.Paths.get;
import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
import static org.elasticsearch.action.support.WriteRequest.RefreshPolicy.IMMEDIATE;
import static org.fest.assertions.Assertions.assertThat;
import static org.fest.assertions.MapAssert.entry;
Expand Down Expand Up @@ -201,33 +210,78 @@ public void test_title_and_title_norm() throws Exception {
assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "t-file.txt"));
}

/**
 * A tweet (content type {@code application/json; twint}) with no Dublin Core title is
 * indexed with the file name as {@code title} and its lower-cased form as {@code titleNorm}.
 */
@Test
public void test_title_and_title_norm_for_an_tweet() throws Exception {
    final TikaDocument tweet = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Tweet-File.json"));
    tweet.setReader(new ParsingReader(new ByteArrayInputStream("test".getBytes())));
    tweet.getMetadata().set(CONTENT_TYPE, "application/json; twint");

    spewer.write(tweet);

    final GetRequest lookup = new GetRequest(TEST_INDEX, tweet.getId());
    final Map<String, Object> indexed = es.client.get(lookup, RequestOptions.DEFAULT).getSourceAsMap();
    assertThat(indexed).includes(entry("title", "Tweet-File.json"));
    assertThat(indexed).includes(entry("titleNorm", "tweet-file.json"));
}

/**
 * A tweet (content type {@code application/json; twint}) carrying a Dublin Core title is
 * indexed with that title as {@code title} and its lower-cased form as {@code titleNorm}.
 */
@Test
public void test_title_and_title_norm_for_an_tweet_with_dc_title() throws Exception {
    final TikaDocument tweet = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Tweet-File.json"));
    tweet.setReader(new ParsingReader(new ByteArrayInputStream("test".getBytes())));
    tweet.getMetadata().set(CONTENT_TYPE, "application/json; twint");
    tweet.getMetadata().set(DublinCore.TITLE, "This is a tweet.");

    spewer.write(tweet);

    final GetRequest lookup = new GetRequest(TEST_INDEX, tweet.getId());
    final Map<String, Object> indexed = es.client.get(lookup, RequestOptions.DEFAULT).getSourceAsMap();
    assertThat(indexed).includes(entry("title", "This is a tweet."));
    assertThat(indexed).includes(entry("titleNorm", "this is a tweet."));
}

// NOTE(review): this span appears to come from a diff rendered without +/- markers.
// It declares `document` twice ("E-File.txt" and "Email-File.txt") and carries two pairs
// of title/titleNorm assertions, so it cannot compile as written. Presumably the
// "Email-File.txt" / "message/http" lines are the post-change version (an email without
// subject or title falls back to the file name) -- confirm against the repository
// before editing.
@Test
public void test_title_and_title_norm_for_an_email() throws Exception {
final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("E-File.txt"));
final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Email-File.txt"));
final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes()));
document.setReader(reader);
document.getMetadata().set(DublinCore.TITLE, "Email Title");
document.getMetadata().set(CONTENT_TYPE, "message/http");

spewer.write(document);

GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
assertThat(documentFields.getSourceAsMap()).includes(entry("title", "Email Title"));
assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "email title"));
assertThat(documentFields.getSourceAsMap()).includes(entry("title", "Email-File.txt"));
assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "email-file.txt"));
}

// NOTE(review): this span appears to come from a diff rendered without +/- markers.
// It contains two method signatures (`..._for_a_tweet` and `..._for_an_email_with_dc_title`),
// two declarations of `document`, and two pairs of title/titleNorm assertions, so it cannot
// compile as written. Presumably the "Email-File.txt" / "message/http" / "This is an email."
// lines form the post-change test (an email with only a Dublin Core title is indexed under
// that title) -- confirm against the repository before editing.
@Test
public void test_title_and_title_norm_for_a_tweet() throws Exception {
final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Tweet-File.txt"));
public void test_title_and_title_norm_for_an_email_with_dc_title() throws Exception {
final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Email-File.txt"));
final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes()));
document.setReader(reader);
document.getMetadata().set(DublinCore.TITLE, "Email Title");
document.getMetadata().set(DublinCore.SUBJECT, "Tweet Title");
document.getMetadata().set(CONTENT_TYPE, "message/http");
document.getMetadata().set(DublinCore.TITLE, "This is an email.");

spewer.write(document);

GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
assertThat(documentFields.getSourceAsMap()).includes(entry("title", "Tweet Title"));
assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "tweet title"));
assertThat(documentFields.getSourceAsMap()).includes(entry("title", "This is an email."));
assertThat(documentFields.getSourceAsMap()).includes(entry("titleNorm", "this is an email."));
}

/**
 * An email (content type {@code message/http}) carrying both a Dublin Core title and a
 * Dublin Core subject is indexed with the subject as {@code title} and its lower-cased
 * form as {@code titleNorm}.
 */
@Test
public void test_title_and_title_norm_for_an_email_with_dc_subject() throws Exception {
    final TikaDocument email = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("Email-File.txt"));
    email.setReader(new ParsingReader(new ByteArrayInputStream("test".getBytes())));
    email.getMetadata().set(CONTENT_TYPE, "message/http");
    email.getMetadata().set(DublinCore.TITLE, "This is an email.");
    email.getMetadata().set(DublinCore.SUBJECT, "This is a more detailed email.");

    spewer.write(email);

    final GetRequest lookup = new GetRequest(TEST_INDEX, email.getId());
    final Map<String, Object> indexed = es.client.get(lookup, RequestOptions.DEFAULT).getSourceAsMap();
    assertThat(indexed).includes(entry("title", "This is a more detailed email."));
    assertThat(indexed).includes(entry("titleNorm", "this is a more detailed email."));
}

@Test
Expand Down Expand Up @@ -334,6 +388,17 @@ public void test_get_max_content_length_is_limited_to_2G() {
.isEqualTo(HumanReadableSize.parse("2G")-1); // Integer.MAX_VALUE
}

/**
 * Deletes every document of the {@code test-datashare} index (match-all delete-by-query)
 * after each test.
 *
 * @throws IllegalStateException if the delete-by-query request fails with an I/O error
 */
@After
public void after() {
    // NOTE(review): presumably the same index as TEST_INDEX used by the tests -- confirm
    // and consolidate on the constant.
    final DeleteByQueryRequest purgeAll = new DeleteByQueryRequest("test-datashare");
    purgeAll.setQuery(QueryBuilders.matchAllQuery());
    try {
        es.client.deleteByQuery(purgeAll, RequestOptions.DEFAULT);
    } catch (IOException e) {
        // Fail loudly: a silently skipped cleanup leaves documents behind that can
        // change the outcome of later tests.
        throw new IllegalStateException("Could not clean up index test-datashare after test", e);
    }
}

private Map<String, Object> convert(Metadata metadata) {
Map<String, Object> map = new HashMap<>();
for (String name: metadata.names()) {
Expand Down

0 comments on commit 1483075

Please sign in to comment.