-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpdf-splitter.sh
executable file
·105 lines (83 loc) · 3.84 KB
/
pdf-splitter.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/bash
# Print the help banner to stdout: what the script does, the tools it
# lists as libraries, a worked splitting example and the two expected
# arguments (input PDF path and regular expression).
display_usage() {
  # Quoted delimiter: the banner is emitted literally, with no expansion.
  cat <<'USAGE_BANNER'
###############################################################################
# pdf-splitter is a bash-script that splits a pdf-document whenever #
# a barcode/qr-code is found that matches the regular expression (regex) #
# #
# Author: Daniel Forrer, 2018-07-09 #
# #
# Libraries: ghostscript, poppler-utils (pdfinfo), #
# zbar-tools (barcode-detection) #
# #
# Example: A pdf-file with 6 pages has a barcode which matches the regex on #
# the first page, the third page and the last page. #
# Result: The script will split the pdf into the following pieces: #
# Page 1-2, Page 3-5, Page 6 #
# #
# Usage: The 1. argument is the path to the input-PDF-file. #
# The 2. argument is the regular expression. #
# Example: bash pdf-splitter.sh ~/input/merged2.pdf 20[1-3][0-9]000[0-9]{6} #
# #
# Tested on: Linux Mint 18.3 Sylvia, macos 10.13.5 #
###############################################################################
USAGE_BANNER
}
# Source: https://www.linuxjournal.com/content/tech-tip-extract-pages-pdf
# Extract a contiguous page range from a PDF into a new file using pdftk.
# Arguments:
#   $1 - first page of the range to extract
#   $2 - last page of the range to extract
#   $3 - input file
# Output file: the input path with a trailing ".pdf" removed, followed by
#   "_pXX-pYY.pdf" (XX = $1, YY = $2).
# Returns: the exit status of pdftk.
function pdfpextr()
{
  # Invoke pdftk directly with quoted arguments rather than eval-ing a
  # concatenated command string: paths containing spaces or shell
  # metacharacters are then passed through as single, literal arguments.
  pdftk "${3}" cat "${1}-${2}" output "${3%.pdf}_p${1}-p${2}.pdf"
}
# Make sure we have exactly two parameters, then save them.
if [[ $# -ne 2 ]]; then
  echo ' Wrong number of parameters! ' >&2
  display_usage >&2
  # A usage error is a failure, so report it with a non-zero exit status.
  exit 1
fi
inputfile=$1
regex=$2

if [[ ! -r "$inputfile" ]]; then
  echo "Inputfile '$inputfile' does not exist or is not readable" >&2
  exit 1
fi

# Create a temporary directory and make sure it gets deleted on EXIT.
# The path separator is added explicitly so that a TMPDIR without a
# trailing slash still yields a directory inside TMPDIR.
tempbase=${TMPDIR:-/tmp}
tempbase=${tempbase%/}
tempdir=$(mktemp -d "${tempbase}/${0##*/}.XXXXXXXXXXXX") || {
  echo "Could not create a temporary directory" >&2
  exit 1
}
# Single quotes defer the expansion until the trap fires and keep the
# path quoted, so directory names with spaces are removed correctly.
trap 'rm -rf -- "$tempdir"' EXIT

# The rendered page is addressed by its full path instead of changing into
# the temporary directory: the working directory stays untouched, so a
# relative input path keeps resolving and the extracted PDFs are written
# next to the input file rather than into the directory removed on EXIT.
pagefile="$tempdir/currentPage.jpeg"

# Read the page count from the "Pages:" line of the pdfinfo output only.
totalpages=$(pdfinfo "$inputfile" | awk '/^Pages:/ {print $2; exit}')
if [[ ! $totalpages =~ ^[1-9][0-9]*$ ]]; then
  echo "Could not determine the number of pages of '$inputfile'" >&2
  exit 1
fi
echo "Inputfile '$inputfile' has $totalpages pages"

# fromPage is the beginning of the next pdf
fromPage=1
currentPage=1
# Page 1 always starts the first piece, so scanning begins at page 2.
while (( currentPage < totalpages )); do
  currentPage=$((currentPage + 1))
  echo "currentPage: $currentPage"

  # use Ghostscript to extract the current page of the PDF-file to a single JPEG-file
  if ! gs -o "$pagefile" \
    -sDEVICE=jpeg \
    -dNOPAUSE -r300x300 \
    -dFirstPage="$currentPage" \
    -dLastPage="$currentPage" \
    "$inputfile"; then
    # Without a fresh image the previous page's JPEG would be scanned
    # again and could trigger a split at the wrong page.
    echo "Ghostscript failed to render page $currentPage" >&2
    exit 1
  fi

  # extract ALL the barcodes from currentPage.jpeg
  # because the first extracted barcode might not be the one we are looking for
  barcodes="$(zbarimg --raw -q "$pagefile")"
  # Quoted so the decoded data is printed as-is (no word splitting/globbing).
  printf '%s\n' "$barcodes"

  # match the regular expression against the barcodes-string
  if [[ $barcodes =~ $regex ]]; then
    echo "Regex match found"
    # here we export the pages fromPage to currentPage-1 as a pdf
    pdfpextr "$fromPage" "$((currentPage - 1))" "$inputfile"
    fromPage=$currentPage
  else
    echo "No regex match found"
  fi
done

# The last piece always ends on the final page. Exporting it after the loop
# (instead of inside it) also covers a one-page document, for which the
# loop body never runs.
echo "$totalpages = $currentPage (totalpages = currentPage)"
pdfpextr "$fromPage" "$totalpages" "$inputfile"