Merge pull request #1 from codeforIATI/add-urls

Add URLs to output
codeforIATI · Aug 25, 2021 · 6369cd4
2 parents c0fee9d + 2fd45b4
commit 6369cd4
Showing 1 changed file with 84 additions and 77 deletions.
diff --git a/glide.py b/glide.py
@@ -3,98 +3,105 @@
 import csv
 import os
 import datetime
+BASE_URL = "https://glidenumber.net"
 SEARCH_URL = "https://glidenumber.net/glide/public/search/search.jsp"
 URL = "https://glidenumber.net/glide/public/result/report.jsp"
 
 CSV_FILENAME = "output/glide-emergencies.csv"
-HEADERS = ["GLIDE_number", "Event", "Country", "Date", "Event_Code", "Country_Code", "Glide_Serial", "Comments"]
+HEADERS = ["GLIDE_number", "URL", "Event", "Country", "Date", "Event_Code", "Country_Code", "Glide_Serial", "Comments"]
 
 REQUEST_HEADERS = {
-  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
 }
 
 
 def download(csv):
-  def get_t(row, i):
-    return row.xpath("td")[i].text_content().strip()
+    def get_t(row, i):
+        return row.xpath("td")[i].text_content().strip()
 
-  def make_date(value):
-    year, month, day = value.split("/")
-    try:
-      return datetime.date(year=int(year),
-          month=int(month),
-          day=int(day)
-        ).isoformat()
-    # Some dates are formatted incorrectly as
-    #  yyyy/d/mm rather than
-    #  yyyy/m/dd
-    except ValueError:
-      return value
+    def make_date(value):
+        year, month, day = value.split("/")
+        try:
+            return datetime.date(
+                year=int(year),
+                month=int(month),
+                day=int(day)
+            ).isoformat()
+        # Some dates are formatted incorrectly as
+        #    yyyy/d/mm rather than
+        #    yyyy/m/dd
+        except ValueError:
+            return value
 
+    s = requests.session()
+    s.headers.update(REQUEST_HEADERS)
 
-  s = requests.session()
-  s.headers.update(REQUEST_HEADERS)
+    search_post_data = [
+        ("level0", "*"),
+        ("level1", "*"),
+        ("events", "*"),
+        ("keywords", ""),
+        ("ftoption", "&"),
+        ("fromyear", ""),
+        ("frommonth", ""),
+        ("fromday", ""),
+        ("toyear", ""),
+        ("tomonth", ""),
+        ("today", ""),
+        ("maxhits", "10000"),
+        ("sortby", "0"),
+        ("X_Resolution", "1920"),
+        ("nStart", "0"),
+        ("posted", "0"),
+        ("process", "/public/result/report.jsp"),
+        ("go.x", "Search")
+    ]
+    print("Opening GLIDEnumber.net")
+    r = s.post(SEARCH_URL, data=search_post_data)
+    doc = html.fromstring(r.text)
+    urls = [
+        BASE_URL + a.get('href').split('&', 1)[0]
+        for a in doc.xpath('//table')[6].xpath('tr/td[1]/a')]
 
-  search_post_data = [
-    ("level0", "*"),
-    ("level1", "*"),
-    ("events", "*"),
-    ("keywords", ""),
-    ("ftoption", "&"),
-    ("fromyear", ""),
-    ("frommonth", ""),
-    ("fromday", ""),
-    ("toyear", ""),
-    ("tomonth", ""),
-    ("today", ""),
-    ("maxhits", "10000"),
-    ("sortby", "0"),
-    ("X_Resolution", "1920"),
-    ("nStart", "0"),
-    ("posted", "0"),
-    ("process", "/public/result/report.jsp"),
-    ("go.x", "Search")
-  ]
-  print("Opening GLIDEnumber.net")
-  s.post(SEARCH_URL, data=search_post_data)
+    post_data = [
+        ("continueReport", "Continue"),
+        ("unlimited", "Y"),
+        ("variables", "disasters.sEventId || '-' || sGlide || '-' ||    sLocationCode as GLIDE_number"),
+        ("variables", "sEventName as Event"),
+        ("variables", "geography.sLocation as Country"),
+        ("variables", "(CAST(nyear as varchar(8)) || '/' || CAST(nmonth    as varchar(8)) || '/' || CAST(nday as varchar(8))) as Date"),
+        ("variables", "disasters.seventid as Event_Code"),
+        ("variables", "slocationcode as Country_Code"),
+        ("variables", "sglide as Glide_Serial"),
+        ("variables", "scomments as Comments"),
+    ]
 
-  post_data = [
-    ("continueReport", "Continue"),
-    ("unlimited", "Y"),
-    ("variables", "disasters.sEventId || '-' || sGlide || '-' ||  sLocationCode as GLIDE_number"),
-    ("variables", "sEventName as Event"),
-    ("variables", "geography.sLocation as Country"),
-    ("variables", "(CAST(nyear as varchar(8)) || '/' || CAST(nmonth  as varchar(8)) || '/' || CAST(nday as varchar(8))) as Date"),
-    ("variables", "disasters.seventid as Event_Code"),
-    ("variables", "slocationcode as Country_Code"),
-    ("variables", "sglide as Glide_Serial"),
-    ("variables", "scomments as Comments"),
-  ]
-
-  print("Requesting list of GLIDE numbers, this may take a moment...")
-  r = s.post(URL, data=post_data)
-  doc = html.fromstring(r.text)
-  rows = doc.xpath("//table[3]")[0].xpath("tr/td/table[2]/tr")
-  print("Found {} entries".format(len(rows)))
-  for row in rows:
-    #if not row.xpath("tr/td[@class='bfS']"): continue
-    if (len(row.xpath("td"))!=8):
-      print("Irregular column width, skipping")
-      continue
-    csv.writerow({"GLIDE_number": get_t(row, 0),
-      "Event": get_t(row, 1),
-      "Country": get_t(row, 2),
-      "Date": make_date(get_t(row, 3)),
-      "Event_Code": get_t(row, 4),
-      "Country_Code": get_t(row, 5),
-      "Glide_Serial": get_t(row, 6),
-      "Comments": get_t(row, 7)
-    })
+    print("Requesting list of GLIDE numbers, this may take a moment...")
+    r = s.post(URL, data=post_data)
+    doc = html.fromstring(r.text)
+    rows = doc.xpath("//table")[2].xpath("tr/td/table[2]/tr")
+    print("Found {} entries".format(len(rows)))
+    for row, url in zip(rows, urls):
+        # if not row.xpath("tr/td[@class='bfS']"): continue
+        if (len(row.xpath("td")) != 8):
+            print("Irregular column width, skipping")
+            continue
+        csv.writerow({
+            "GLIDE_number": get_t(row, 0),
+            "URL": url,
+            "Event": get_t(row, 1),
+            "Country": get_t(row, 2),
+            "Date": make_date(get_t(row, 3)),
+            "Event_Code": get_t(row, 4),
+            "Country_Code": get_t(row, 5),
+            "Glide_Serial": get_t(row, 6),
+            "Comments": get_t(row, 7),
+        })
 
 
 if __name__ == "__main__":
-  os.makedirs("output", exist_ok=True)
-  with open(CSV_FILENAME, "w") as csv_file:
-    csv = csv.DictWriter(csv_file, fieldnames=HEADERS)
-    csv.writeheader()
-    download(csv)
+    os.makedirs("output", exist_ok=True)
+    with open(CSV_FILENAME, "w") as csv_file:
+        csv = csv.DictWriter(csv_file, fieldnames=HEADERS)
+        csv.writeheader()
+        download(csv)