Skip to content

Commit

Permalink
scripts : script to get Paul Graham essays in txt format (ggerganov#4838)
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov authored Jan 9, 2024
1 parent 128de35 commit d965389
Showing 1 changed file with 47 additions and 0 deletions.
47 changes: 47 additions & 0 deletions scripts/get-pg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash

# Print the expected invocation to stderr and abort with status 1.
# Takes no arguments; never returns to the caller.
function usage {
    # <n> is a positional argument, so it belongs after the script name.
    echo "usage: $0 <n>" >&2
    exit 1
}

# Abort the script unless the command named by $1 can be resolved by the shell.
# Arguments: $1 - command name to look up
# Outputs:   an error message on stderr when the lookup fails
# Exits:     with status 1 when the command is missing; otherwise returns 0
function has_cmd {
    # Rely on the exit status of `command -v` rather than testing its output
    # with -x: for builtins and functions the output is a bare name, not a path.
    if ! command -v "$1" >/dev/null 2>&1; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for every external tool the pipelines below rely on:
# curl, html2text, grep, head, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd grep
has_cmd head
has_cmd tail
has_cmd sed
has_cmd fmt

# exactly one argument is required: the number of essays to download
if [ $# -ne 1 ]; then
    usage
fi

# <n> is handed to `head -n` further down, so reject anything that is not a
# plain non-negative integer up front instead of failing mid-pipeline
case "$1" in
    '' | *[!0-9]*) usage ;;
esac

n=$1

# get urls: keep every feed line that mentions an html page, trim each one down
# to the bare link, and keep only the first $n of them.
# -f makes curl return nothing (instead of an error page) on HTTP failures.
urls="$(curl -f http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" -e "s/html.*/html/" | head -n "$n")"

# without any url there is nothing to download; do not pretend we succeeded
if [ -z "$urls" ]; then
    echo "error: no essay urls to process" >&2
    exit 1
fi

printf "urls:\n%s\n" "$urls"

# start from a clean output file (-f: no error when it does not exist yet)
rm -f pg.txt

# read one url per line so the list is never word-split or glob-expanded
while IFS= read -r url; do
    [ -n "$url" ] || continue

    echo "processing $url"

    # -L follows redirects; -f keeps HTTP error pages out of pg.txt.
    # stdin is detached so nothing in the pipeline can eat the url list.
    # tail drops the first 3 lines of the converted text, sed strips leading
    # whitespace, fmt re-wraps to 80 columns.
    curl -fL "$url" < /dev/null | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt

    # don't flood the server
    sleep 1
done <<< "$urls"

echo "done. data in pg.txt"

exit 0

0 comments on commit d965389

Please sign in to comment.