Skip to content

Commit

Permalink
Add normalization of filename and contenttype
Browse files Browse the repository at this point in the history
  • Loading branch information
VictorHarbo committed Jul 20, 2023
1 parent 0c4293e commit dc9d5c0
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import java.io.IOException;
import java.io.OutputStream;
import java.text.Normalizer;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

Expand All @@ -33,7 +34,8 @@ public class StreamingRawZipExport {
public void getStreamingOutputWithZipOfContent(String query, String contentType,
OutputStream output, String... filterQueries) throws IOException {

String fullFilters = SolrUtils.combineFilterQueries("content_type", contentType, filterQueries);
String normalizedContentType = normalizeContentType(contentType);
String fullFilters = SolrUtils.combineFilterQueries("content_type", normalizedContentType, filterQueries);

SRequest request = SRequest.builder()
.query(query)
Expand All @@ -47,12 +49,22 @@ public void getStreamingOutputWithZipOfContent(String query, String contentType,
long streamedDocs = SolrGenericStreaming.create(request).stream()
.map(doc -> extractMetadata(doc, warcMetadata))
.map(StreamingRawZipExport::safeGetArcEntry)
.map(entry -> addArcEntryToZip(entry, zos, contentType, warcMetadata))
.map(entry -> addArcEntryToZip(entry, zos, normalizedContentType, warcMetadata))
.count();

zos.close();
output.close();
log.info("Streamed {} warc entries with the contentType: '{}'.", streamedDocs, contentType);
log.info("Streamed {} warc entries with the contentType: '{}'.", streamedDocs, normalizedContentType);
}

/**
 * Reduce a content type string to plain 7-bit ASCII.
 * <p>
 * The input is first decomposed with Unicode normalization form NFD, which separates an
 * accented letter into its base letter followed by combining marks. Afterwards every
 * character outside the range U+0000-U+007F is removed, so the combining marks disappear
 * while the base letters remain. Non-ASCII characters that NFD leaves unchanged are
 * removed entirely.
 * @param contentType string to strip of non-ASCII characters.
 * @return the decomposed contentType with all non-ASCII characters removed.
 */
private String normalizeContentType(String contentType) {
    return Normalizer
            .normalize(contentType, Normalizer.Form.NFD)
            .replaceAll("[^\\x00-\\x7F]", "");
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import java.net.URI;
import java.net.URL;
import java.text.DateFormat;
import java.text.Normalizer;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
Expand Down Expand Up @@ -481,8 +482,11 @@ private String createZipFilename(String contentType) {
Date date = new Date() ;
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");

return dateFormat.format(date) + "_" + contentType.replaceAll(" ", "")
.replaceAll("/", "_") + "_export.zip";
String filename = dateFormat.format(date) + "_" + contentType.replaceAll(" ", "")
.replaceAll("/", "_") + "_export.zip";
filename = Normalizer.normalize(filename, Normalizer.Form.NFD);

return filename.replaceAll("[^\\x00-\\x7F]", "");
}


Expand Down

0 comments on commit dc9d5c0

Please sign in to comment.