From d1d20cbf5e26c7788f2b140700c992b566249318 Mon Sep 17 00:00:00 2001 From: Ari Porad Date: Tue, 22 Feb 2022 16:01:38 -0500 Subject: [PATCH 1/3] Update URLs for SP22 Semester --- focstest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/focstest.py b/focstest.py index 4fcf808..f1e7e50 100755 --- a/focstest.py +++ b/focstest.py @@ -29,7 +29,7 @@ # default url matching -BASE_URL = "http://rpucella.net/courses/focs-fa20/homeworks/" # default website and path to fetch from +BASE_URL = "http://rpucella.net/courses/focs-sp22/homeworks/" # default website and path to fetch from OCAML_FILE_PATTERN = r"homework(\d{1,2}).ml" # pattern to extract homework number from the user-given ocaml file HTML_FILE_TEMPLATE = "homework{}.html" # template to build the html filename given a homework number @@ -181,7 +181,7 @@ def infer_url(filepath): False >>> infer_url('foo/bar/homework1.ml') - 'http://rpucella.net/courses/focs-fa20/homeworks/homework1.html' + 'http://rpucella.net/courses/focs-sp22/homeworks/homework1.html' """ filename = os.path.basename(filepath) match = OCAML_FILE_COMP.match(filename) From 4f0eb460678cd19c23564abaa52fc11823a6ad4d Mon Sep 17 00:00:00 2001 From: Ari Porad Date: Tue, 22 Feb 2022 16:04:44 -0500 Subject: [PATCH 2/3] Update URL template Riccardo seems to have changed the format. --- focstest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/focstest.py b/focstest.py index f1e7e50..39c0ad6 100755 --- a/focstest.py +++ b/focstest.py @@ -31,7 +31,7 @@ # default url matching BASE_URL = "http://rpucella.net/courses/focs-sp22/homeworks/" # default website and path to fetch from OCAML_FILE_PATTERN = r"homework(\d{1,2}).ml" # pattern to extract homework number from the user-given ocaml file -HTML_FILE_TEMPLATE = "homework{}.html" # template to build the html filename given a homework number +HTML_FILE_TEMPLATE = "{}/index.html" # template to build the html filename given a homework number # selectors for parsing html CODE_BLOCK_SELECTOR = 'pre code' # css selector to get code blocks @@ -181,7 +181,7 @@ def infer_url(filepath): False >>> infer_url('foo/bar/homework1.ml') - 'http://rpucella.net/courses/focs-sp22/homeworks/homework1.html' + 'http://rpucella.net/courses/focs-sp22/homeworks/1/index.html' """ filename = os.path.basename(filepath) match = OCAML_FILE_COMP.match(filename) From 0e98fe6c76433b4033674339acbb74d5588e8446 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Fri, 15 Apr 2022 16:33:15 -0400 Subject: [PATCH 3/3] Improve cache filenames The new scheme prevents multiple urls ending in index.html from clashing, and ensures an extension is always present. --- focstest.py | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/focstest.py b/focstest.py index 39c0ad6..4df3f11 100755 --- a/focstest.py +++ b/focstest.py @@ -192,6 +192,48 @@ def infer_url(filepath): return url +def get_cache_filename(url: str) -> str: + """Get a filesystem-safe filename based on a url + + >>> get_cache_filename('http://foo.bar/baz/qux/') + 'foo_bar_baz_qux.html' + + normalizes protocol + >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('https://foo.bar/baz/qux') + True + + normalizes trailing slashes + >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('http://foo.bar/baz/qux/') + True + + normalizes directory names and index.html + >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('http://foo.bar/baz/qux/index.html') + True + >>> get_cache_filename('http://foo.bar/baz/qux/') == get_cache_filename('http://foo.bar/baz/qux/index.html') + True + + two different homeworks end up with different filenames + >>> hw1 = infer_url('homework1.ml'); hw2 = infer_url('homework2.ml') + >>> get_cache_filename(hw1) != get_cache_filename(hw2) + True + """ + BAD_CHARS = {'\0', '\\', '/', ':', '*', '?', '"', '>', '<', '|', ':'} + + parse_result = urllib.parse.urlparse(url) + # if Riccardo switches to php and the urls are query-encoded like 'homework.php?id=9', this will need to be updated + filename = parse_result.netloc.replace('.', '_') + parse_result.path + # normalize trailing / and /index.html + filename = filename.rstrip('/') + if filename.endswith('/index.html'): + filename = filename[:-len('/index.html')] + filename = ''.join(c if c not in BAD_CHARS else '_' for c in filename) + # default to .html extension + path, ext = os.path.splitext(filename) + if ext == '': + filename += '.html' + return filename + + def main(): parser = argparse.ArgumentParser( description='Run ocaml "doctests".', @@ -244,7 +286,7 @@ def main(): if not os.path.exists(CACHE_DIR): os.makedirs(CACHE_DIR) logger.info('Created cache directory at {!r}'.format(CACHE_DIR)) - page_name = os.path.basename(urllib.parse.urlparse(URL).path) # get page name from url + page_name = get_cache_filename(URL) html_filepath = os.path.join(CACHE_DIR, page_name) # local filepath # get webpage if cached version doesn't already exist