From d1d20cbf5e26c7788f2b140700c992b566249318 Mon Sep 17 00:00:00 2001
From: Ari Porad <aporad@olin.edu>
Date: Tue, 22 Feb 2022 16:01:38 -0500
Subject: [PATCH 1/3] Update URLs for SP22 Semester

---
 focstest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/focstest.py b/focstest.py
index 4fcf808..f1e7e50 100755
--- a/focstest.py
+++ b/focstest.py
@@ -29,7 +29,7 @@
 
 
 # default url matching
-BASE_URL = "http://rpucella.net/courses/focs-fa20/homeworks/"  # default website and path to fetch from
+BASE_URL = "http://rpucella.net/courses/focs-sp22/homeworks/"  # default website and path to fetch from
 OCAML_FILE_PATTERN = r"homework(\d{1,2}).ml"  # pattern to extract homework number from the user-given ocaml file
 HTML_FILE_TEMPLATE = "homework{}.html"  # template to build the html filename given a homework number
 
@@ -181,7 +181,7 @@ def infer_url(filepath):
     False
 
     >>> infer_url('foo/bar/homework1.ml')
-    'http://rpucella.net/courses/focs-fa20/homeworks/homework1.html'
+    'http://rpucella.net/courses/focs-sp22/homeworks/homework1.html'
     """
     filename = os.path.basename(filepath)
     match = OCAML_FILE_COMP.match(filename)

From 4f0eb460678cd19c23564abaa52fc11823a6ad4d Mon Sep 17 00:00:00 2001
From: Ari Porad <aporad@olin.edu>
Date: Tue, 22 Feb 2022 16:04:44 -0500
Subject: [PATCH 2/3] Update URL template

Riccardo seems to have changed the URL format: each homework page is now
served at `<number>/index.html` instead of `homework<number>.html`.
---
 focstest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/focstest.py b/focstest.py
index f1e7e50..39c0ad6 100755
--- a/focstest.py
+++ b/focstest.py
@@ -31,7 +31,7 @@
 # default url matching
 BASE_URL = "http://rpucella.net/courses/focs-sp22/homeworks/"  # default website and path to fetch from
 OCAML_FILE_PATTERN = r"homework(\d{1,2}).ml"  # pattern to extract homework number from the user-given ocaml file
-HTML_FILE_TEMPLATE = "homework{}.html"  # template to build the html filename given a homework number
+HTML_FILE_TEMPLATE = "{}/index.html"  # template to build the html filename given a homework number
 
 # selectors for parsing html
 CODE_BLOCK_SELECTOR = 'pre code'  # css selector to get code blocks
@@ -181,7 +181,7 @@ def infer_url(filepath):
     False
 
     >>> infer_url('foo/bar/homework1.ml')
-    'http://rpucella.net/courses/focs-sp22/homeworks/homework1.html'
+    'http://rpucella.net/courses/focs-sp22/homeworks/1/index.html'
     """
     filename = os.path.basename(filepath)
     match = OCAML_FILE_COMP.match(filename)

From 0e98fe6c76433b4033674339acbb74d5588e8446 Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Date: Fri, 15 Apr 2022 16:33:15 -0400
Subject: [PATCH 3/3] Improve cache filenames

The new scheme prevents multiple urls ending in index.html from
clashing, and ensures an extension is always present.
---
 focstest.py | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/focstest.py b/focstest.py
index 39c0ad6..4df3f11 100755
--- a/focstest.py
+++ b/focstest.py
@@ -192,6 +192,48 @@ def infer_url(filepath):
     return url
 
 
+def get_cache_filename(url: str) -> str:
+    """Get a flat, filesystem-safe filename (always with an extension) based on a url
+
+    >>> get_cache_filename('http://foo.bar/baz/qux/')
+    'foo_bar_baz_qux.html'
+    >>> get_cache_filename('http://foo.bar/baz.v2/qux')  # a dotted directory name still gets .html
+    'foo_bar_baz.v2_qux.html'
+
+    normalizes protocol
+    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('https://foo.bar/baz/qux')
+    True
+
+    normalizes trailing slashes
+    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('http://foo.bar/baz/qux/')
+    True
+
+    normalizes directory names and index.html
+    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('http://foo.bar/baz/qux/index.html')
+    True
+    >>> get_cache_filename('http://foo.bar/baz/qux/') == get_cache_filename('http://foo.bar/baz/qux/index.html')
+    True
+
+    two different homeworks end up with different filenames
+    >>> hw1 = infer_url('homework1.ml'); hw2 = infer_url('homework2.ml')
+    >>> get_cache_filename(hw1) != get_cache_filename(hw2)
+    True
+    """
+    BAD_CHARS = {'\0', '\\', '/', ':', '*', '?', '"', '>', '<', '|'}
+
+    parse_result = urllib.parse.urlparse(url)
+    # if Riccardo switches to php and the urls are query-encoded like 'homework.php?id=9', this will need to be updated
+    filename = parse_result.netloc.replace('.', '_') + parse_result.path
+    # normalize trailing / and /index.html
+    filename = filename.rstrip('/')
+    if filename.endswith('/index.html'):
+        filename = filename[:-len('/index.html')]
+    # look for an extension *before* flattening: once '/' becomes '_', a dot in a directory name would pass for one
+    has_ext = os.path.splitext(filename)[1] != ''
+    filename = ''.join(c if c not in BAD_CHARS else '_' for c in filename)
+    return filename if has_ext else filename + '.html'  # default to .html extension
+
+
 def main():
     parser = argparse.ArgumentParser(
         description='Run ocaml "doctests".',
@@ -244,7 +286,7 @@ def main():
     if not os.path.exists(CACHE_DIR):
         os.makedirs(CACHE_DIR)
         logger.info('Created cache directory at {!r}'.format(CACHE_DIR))
-    page_name = os.path.basename(urllib.parse.urlparse(URL).path)  # get page name from url
+    page_name = get_cache_filename(URL)
     html_filepath = os.path.join(CACHE_DIR, page_name)  # local filepath
 
     # get webpage if cached version doesn't already exist