Commit 279e867
Merge pull request ecprice#49 from carlgieringer/master
Fix NYTParser for new article format
ecprice authored Jun 16, 2018
2 parents e901a30 + f85d3d1 commit 279e867
Showing 6 changed files with 6,801 additions and 20 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -31,3 +31,9 @@ pip-log.txt
 
 newsdiffs.db
 database_settings.py
+
+/.pytest_cache/
+# pyenv
+/.python-version
+# IntelliJ
+/.idea/
93 changes: 73 additions & 20 deletions parsers/nyt.py
@@ -1,5 +1,10 @@
+import re
+
 from baseparser import BaseParser
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
+
+
+paragraph_wrapper_re = re.compile(r'.*\bStoryBodyCompanionColumn\b.*')
 
 class NYTParser(BaseParser):
     SUFFIX = '?pagewanted=all'
@@ -41,48 +46,96 @@ class NYTParser(BaseParser):
                    'http://www.nytimes.com/pages/todayspaper/',
                    'http://topics.nytimes.com/top/opinion/thepubliceditor/']
 
+
     def _parse(self, html):
-        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
+        soup = BeautifulSoup(html, 'html.parser')
         self.meta = soup.findAll('meta')
-        try:
-            seo_title = soup.find('meta', attrs={'name':'hdl'}).get('content')
-        except AttributeError:
-            self.real_article = False
-            return
-        tmp = soup.find('meta', attrs={'name':'hdl_p'})
+
+        seo_title = soup.find('meta', attrs={'name': 'hdl'})
+        if seo_title:
+            seo_title = seo_title.get('content')
+        else:
+            seo_title = soup.find('meta', attrs={'property':"og:title"}).get('content')
+
+        tmp = soup.find('meta', attrs={'name': 'hdl_p'})
         if tmp and tmp.get('content'):
             self.title = tmp.get('content')
         else:
+            meta_og_title = soup.find('meta', attrs={'property': 'og:title'})
+            if meta_og_title:
+                self.title = meta_og_title.get('content')
+        if not self.title:
             self.title = seo_title
+
         try:
             self.date = soup.find('meta', attrs={'name':'dat'}).get('content')
             self.byline = soup.find('meta', attrs={'name':'byl'}).get('content')
         except AttributeError:
-            self.real_article = False
-            return
+            try:
+                self.date = soup.find('time').getText()
+                self.byline = soup.find('p', attrs={'itemprop': 'author creator'}).getText()
+            except:
+                self.real_article = False
+                return
         p_tags = sum([list(soup.findAll('p', attrs=restriction))
                       for restriction in [{'itemprop': 'articleBody'},
                                           {'itemprop': 'reviewBody'},
-                                          {'class':'story-body-text story-content'}
+                                          {'class': 'story-body-text story-content'}
                                           ]],
                      [])
 
+        if not p_tags:
+            p_tags = sum([div.findAll(['p', 'h2']) for div in soup.findAll('div', attrs={'class': paragraph_wrapper_re})], [])
+        if not p_tags:
+            article = soup.find('article', attrs={'id': 'story'})
+            article_p_tags = article.findAll('p')
+
+            header_p_tags = article.find('header').findAll('p')
+            bottom_of_article = article.find('div', attrs={'class': 'bottom-of-article'})
+
+            p_tags = [
+                p_tag for p_tag in article_p_tags
+                if (
+                    p_tag.getText() and
+                    # Remove header p_tags because it duplicates the title
+                    p_tag not in header_p_tags and
+                    # Remove bottom of article p_tags because we add them as the correction
+                    bottom_of_article not in p_tag.parents and
+                    p_tag.getText() != 'Advertisement'
+                )
+            ]
+
         div = soup.find('div', attrs={'class': 'story-addendum story-content theme-correction'})
         if div:
             p_tags += [div]
-        footer = soup.find('footer', attrs={'class':'story-footer story-content'})
+        footer = soup.find('footer', attrs={'class': 'story-footer story-content'})
+
         if footer:
-            p_tags += list(footer.findAll(lambda x: x.get('class') != 'story-print-citation' and x.name == 'p'))
+            p_tags += list(footer.findAll(lambda x: x.get('class') is not None and 'story-print-citation' not in x.get('class') and x.name == 'p'))
 
         main_body = '\n\n'.join([p.getText() for p in p_tags])
         authorids = soup.find('div', attrs={'class':'authorIdentification'})
         authorid = authorids.getText() if authorids else ''
 
         top_correction = '\n'.join(x.getText() for x in
                                    soup.findAll('nyt_correction_top')) or '\n'
-        bottom_correction = '\n'.join(x.getText() for x in
-                                   soup.findAll('nyt_correction_bottom')) or '\n'
-        self.body = '\n'.join([top_correction,
-                               main_body,
-                               authorid,
-                               bottom_correction,])
+
+        bottom_correction = ''
+        correction_bottom_tags = soup.findAll('nyt_correction_bottom')
+        if correction_bottom_tags:
+            bottom_correction = '\n'.join(x.getText() for x in correction_bottom_tags)
+        if not correction_bottom_tags:
+            bottom_of_article = soup.find('div', attrs={'class': 'bottom-of-article'})
+            if bottom_of_article:
+                bottom_correction = bottom_of_article.getText()
+                print_info_index = bottom_correction.find('A version of this article appears in print on')
+                if print_info_index > -1:
+                    bottom_correction = bottom_correction[:print_info_index]
+        if not bottom_correction:
+            bottom_correction = '\n'
+
+        self.body = '\n'.join([
+            top_correction,
+            main_body,
+            authorid,
+            bottom_correction,
+        ])
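The new fallback chain hinges on bs4 matching a compiled regex against an element's class attribute, which is how the StoryBodyCompanionColumn wrapper divs are found. A minimal standalone sketch of that technique; the markup and the css-example class name are invented for illustration:

    import re

    from bs4 import BeautifulSoup

    # Invented markup in the style of the new NYT article format.
    html = '''
    <div class="css-example StoryBodyCompanionColumn">
      <p>First paragraph.</p>
      <h2>A subheading</h2>
    </div>
    '''

    # Same pattern as the parser above: bs4 tries the regex against each
    # class token, so the wrapper matches regardless of neighboring classes.
    paragraph_wrapper_re = re.compile(r'.*\bStoryBodyCompanionColumn\b.*')

    soup = BeautifulSoup(html, 'html.parser')
    wrappers = soup.findAll('div', attrs={'class': paragraph_wrapper_re})

    # Flatten the per-wrapper tag lists, as the parser does with sum(..., []).
    p_tags = sum([div.findAll(['p', 'h2']) for div in wrappers], [])
    print([tag.getText() for tag in p_tags])  # ['First paragraph.', 'A subheading']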
28 changes: 28 additions & 0 deletions parsers/test_nyt.py
@@ -0,0 +1,28 @@
+from mock import patch
+
+from nyt import NYTParser
+
+
+@patch('parsers.baseparser.grab_url')
+def test_old_format(mock_grab_url):
+    from test_nyt_data import HTML_OLD_FORMAT
+    mock_grab_url.return_value = HTML_OLD_FORMAT
+    parser = NYTParser('https://www.nytimes.com/2018/05/18/us/school-shooting-santa-fe-texas.html')
+    assert 'SANTA FE, Tex.' in parser.body
+
+
+@patch('parsers.baseparser.grab_url')
+def test_new_format(mock_grab_url):
+    from test_nyt_data import HTML_NEW_FORMAT
+    mock_grab_url.return_value = HTML_NEW_FORMAT
+    parser = NYTParser('https://www.nytimes.com/2018/05/16/us/politics/mueller-trump-indictment.html')
+    assert 'Trump' in parser.body
+    assert len(parser.body.split('\n\n')) == 28
+    assert not parser.title.endswith('- The New York Times')
+
+@patch('parsers.baseparser.grab_url')
+def test_corrections(mock_grab_url):
+    from test_nyt_data import HTML_WITH_CORRECTION
+    mock_grab_url.return_value = HTML_WITH_CORRECTION
+    parser = NYTParser('https://www.nytimes.com/2018/06/05/world/europe/greece-macedonia.html')
+    assert 'Correction:' in parser.body
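The tests patch grab_url at the point where BaseParser looks it up, so constructing an NYTParser never touches the network. A sketch of the same pattern outside pytest, reusing the fixture and URL from test_new_format above, assuming the interpreter is run where the bare nyt and test_nyt_data imports resolve and the parsers package is importable:

    from mock import patch

    from nyt import NYTParser
    from test_nyt_data import HTML_NEW_FORMAT

    # Patch grab_url where BaseParser resolves it; the constructor then
    # parses the canned fixture instead of fetching the live page.
    with patch('parsers.baseparser.grab_url', return_value=HTML_NEW_FORMAT):
        parser = NYTParser('https://www.nytimes.com/2018/05/16/us/politics/mueller-trump-indictment.html')

    print(parser.title)
    print(parser.body[:200])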