Skip to content

Commit

Permalink
Update filename construction
Browse files Browse the repository at this point in the history
  • Loading branch information
VictorHarbo committed Jul 24, 2023
1 parent c7eb59d commit 3bf93ff
Showing 1 changed file with 12 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

Expand Down Expand Up @@ -109,7 +108,7 @@ private ArcEntry addArcEntryToZip(ArcEntry entry, ZipOutputStream zos, WarcMetad
* @param warcMetadata contains the timestamp, id, originalUrl and file extension, which is used to create the filename.
* @return a string in the format timestamp_id_originalUrlStrippedForNonASCIIChars.extension.
*/
private String createFilename(WarcMetadataFromSolr warcMetadata) {
private String createFilename(WarcMetadataFromSolr warcMetadata) {

String filename;
if (warcMetadata.getMimetype().contains("text/html")) {
Expand All @@ -123,7 +122,17 @@ private String createFilename(WarcMetadataFromSolr warcMetadata) {
}

// Remove everything non-alphanumerical or underscore
filename = filename.replaceAll("[^A-Za-z0-9_]", "");
return normalizeFilename(filename);
}

public static String normalizeFilename(String filename) {
filename = filename.replaceAll("[^A-Za-z0-9_.]", "");

// Remove all but last dot
filename = filename.substring(0, filename.lastIndexOf(".")).replaceAll("\\." , "").concat(filename.substring(filename.lastIndexOf(".")));

// Remove an underscore that sits directly before a dot
filename = filename.replaceAll("_\\.", ".");
// Remove two or more consecutive underscores
filename = filename.replaceAll("_{2,}", "_");

Expand Down

0 comments on commit 3bf93ff

Please sign in to comment.