Skip to content

Commit

Permalink
scripts : improve get-pg.sh (ggerganov#4838)
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Jan 9, 2024
1 parent 18adb4e commit 9a818f7
Showing 1 changed file with 24 additions and 1 deletion.
25 changes: 24 additions & 1 deletion scripts/get-pg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,22 @@

# Print the usage message followed by a table of pg.txt token counts for
# selected values of n, then terminate the script.
#
# Arguments: none
# Outputs:   usage text and token table on stdout
# Exits:     always, with status 1
function usage {
    # Script name first, then the positional argument placeholder, so the
    # line reads like an invocation (e.g. "usage: ./get-pg.sh <n>").
    printf '%s\n' \
        "usage: $0 <n>" \
        "note: n is the number of essays to download" \
        "for specific n, the resulting pg.txt file will have the following number of tokens:" \
        "n | tokens" \
        "--- | ---" \
        "1 | 6230" \
        "2 | 23619" \
        "5 | 25859" \
        "10 | 36888" \
        "15 | 50188" \
        "20 | 59094" \
        "25 | 88764" \
        "30 | 103121" \
        "32 | 108338" \
        "35 | 113403" \
        "40 | 127699" \
        "45 | 135896"
    exit 1
}

Expand Down Expand Up @@ -33,10 +49,17 @@ if [ -f pg.txt ]; then
rm pg.txt
fi

c=1
for url in $urls; do
echo "processing $url"

curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt
cc=$(printf "%03d" $c)

curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
cat pg-$cc-one.txt >> pg.txt

cp -v pg.txt pg-$cc-all.txt
c=$((c+1))

# don't flood the server
sleep 1
Expand Down

0 comments on commit 9a818f7

Please sign in to comment.