-
Notifications
You must be signed in to change notification settings - Fork 2
/
check-file-miscoded.pl
executable file
·37 lines (31 loc) · 1.18 KB
/
check-file-miscoded.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env perl
use v5.22;
use strict;
use warnings;
# Classify each input file by the kind of non-ASCII bytes it contains and
# print "<file>: <label>" for every file that needs attention. Files holding
# only 7-bit bytes produce no output. The script dies on the first file whose
# 8-bit bytes fit neither known pattern.

# Slurp mode: read each file as a single string so one match sees the whole
# file. `local` scopes the change instead of permanently clobbering the
# global input record separator.
local $/;

# Double diamond automatically reads from @ARGV in order (or STDIN when no
# files are given), but without involving the shell in interpreting file
# names.
FILE:
while (defined(my $content = <<>>)) {
    # 3 files in the source data begin with these 3 bytes, the UTF-8 Byte
    # Order Mark (BOM), which some Windows tools write. UTF-8 is a
    # variable-width encoding with a fixed byte order, so the BOM is
    # unnecessary there.
    #
    # This test must come before the cp1252 test: every byte in the cp1252
    # class below is also a valid UTF-8 continuation byte (an en dash is
    # E2 80 93 in UTF-8), so a BOM-marked UTF-8 file could otherwise be
    # reported as cp1252.
    if ($content =~ /\A\xEF\xBB\xBF/) {
        say "$ARGV: utf8-bom";
        next FILE;
    }

    # These specific bytes are present throughout the source NAVADMIN text
    # and seem to be invariably Windows code page 1252: curly quotes
    # (0x91-0x94), en/em dashes (0x96-0x97) and the fractions 1/4, 1/2, 3/4
    # (0xBC-0xBE).
    if ($content =~ /[\x91-\x94\x96-\x97\xBC-\xBE]/) {
        say "$ARGV: cp1252";
        # From some manual work there never seem to be multiple issues in
        # the same source file, so we move on to the next file as soon as
        # we find one.
        next FILE;
    }

    # Any other byte with the high bit set is outside both known patterns.
    if ($content =~ /[\x80-\x90\x95\x98-\xBB\xBF-\xFF]/) {
        # This would be unexpected based on files we've seen, so crash the
        # script so we take a look.
        die "$ARGV: unknown-8bit sequence detected";
    }
}