Commit 279e867
Merge pull request ecprice#49 from carlgieringer/master
Fix NYTParser for new article format
ecprice authored Jun 16, 2018
2 parents e901a30 + f85d3d1 commit 279e867
Showing 6 changed files with 6,801 additions and 20 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -31,3 +31,9 @@ pip-log.txt
 
 newsdiffs.db
 database_settings.py
+
+/.pytest_cache/
+# pyenv
+/.python-version
+# IntelliJ
+/.idea/
93 changes: 73 additions & 20 deletions parsers/nyt.py
@@ -1,5 +1,10 @@
+import re
+
 from baseparser import BaseParser
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
+
+
+paragraph_wrapper_re = re.compile(r'.*\bStoryBodyCompanionColumn\b.*')
 
 class NYTParser(BaseParser):
     SUFFIX = '?pagewanted=all'
@@ -41,48 +46,96 @@ class NYTParser(BaseParser):
                    'http://www.nytimes.com/pages/todayspaper/',
                    'http://topics.nytimes.com/top/opinion/thepubliceditor/']
 
+
     def _parse(self, html):
-        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
+        soup = BeautifulSoup(html, 'html.parser')
         self.meta = soup.findAll('meta')
-        try:
-            seo_title = soup.find('meta', attrs={'name':'hdl'}).get('content')
-        except AttributeError:
-            self.real_article = False
-            return
-        tmp = soup.find('meta', attrs={'name':'hdl_p'})
+
+        seo_title = soup.find('meta', attrs={'name': 'hdl'})
+        if seo_title:
+            seo_title = seo_title.get('content')
+        else:
+            seo_title = soup.find('meta', attrs={'property':"og:title"}).get('content')
+
+        tmp = soup.find('meta', attrs={'name': 'hdl_p'})
         if tmp and tmp.get('content'):
             self.title = tmp.get('content')
         else:
+            meta_og_title = soup.find('meta', attrs={'property': 'og:title'})
+            if meta_og_title:
+                self.title = meta_og_title.get('content')
+        if not self.title:
             self.title = seo_title
+
         try:
             self.date = soup.find('meta', attrs={'name':'dat'}).get('content')
             self.byline = soup.find('meta', attrs={'name':'byl'}).get('content')
         except AttributeError:
-            self.real_article = False
-            return
+            try:
+                self.date = soup.find('time').getText()
+                self.byline = soup.find('p', attrs={'itemprop': 'author creator'}).getText()
+            except:
+                self.real_article = False
+                return
         p_tags = sum([list(soup.findAll('p', attrs=restriction))
                       for restriction in [{'itemprop': 'articleBody'},
                                           {'itemprop': 'reviewBody'},
-                                          {'class':'story-body-text story-content'}
+                                          {'class': 'story-body-text story-content'}
                                           ]],
                      [])
 
+        if not p_tags:
+            p_tags = sum([div.findAll(['p', 'h2']) for div in soup.findAll('div', attrs={'class': paragraph_wrapper_re})], [])
+        if not p_tags:
+            article = soup.find('article', attrs={'id': 'story'})
+            article_p_tags = article.findAll('p')
+
+            header_p_tags = article.find('header').findAll('p')
+            bottom_of_article = article.find('div', attrs={'class': 'bottom-of-article'})
+
+            p_tags = [
+                p_tag for p_tag in article_p_tags
+                if (
+                    p_tag.getText() and
+                    # Remove header p_tags because it duplicates the title
+                    p_tag not in header_p_tags and
+                    # Remove bottom of article p_tags because we add them as the correction
+                    bottom_of_article not in p_tag.parents and
+                    p_tag.getText() != 'Advertisement'
+                )
+            ]
+
         div = soup.find('div', attrs={'class': 'story-addendum story-content theme-correction'})
         if div:
             p_tags += [div]
-        footer = soup.find('footer', attrs={'class':'story-footer story-content'})
+        footer = soup.find('footer', attrs={'class': 'story-footer story-content'})
+
         if footer:
-            p_tags += list(footer.findAll(lambda x: x.get('class') != 'story-print-citation' and x.name == 'p'))
+            p_tags += list(footer.findAll(lambda x: x.get('class') is not None and 'story-print-citation' not in x.get('class') and x.name == 'p'))
 
         main_body = '\n\n'.join([p.getText() for p in p_tags])
         authorids = soup.find('div', attrs={'class':'authorIdentification'})
         authorid = authorids.getText() if authorids else ''
 
         top_correction = '\n'.join(x.getText() for x in
                                    soup.findAll('nyt_correction_top')) or '\n'
-        bottom_correction = '\n'.join(x.getText() for x in
-                                   soup.findAll('nyt_correction_bottom')) or '\n'
-        self.body = '\n'.join([top_correction,
-                               main_body,
-                               authorid,
-                               bottom_correction,])
+
+        bottom_correction = ''
+        correction_bottom_tags = soup.findAll('nyt_correction_bottom')
+        if correction_bottom_tags:
+            bottom_correction = '\n'.join(x.getText() for x in correction_bottom_tags)
+        if not correction_bottom_tags:
+            bottom_of_article = soup.find('div', attrs={'class': 'bottom-of-article'})
+            if bottom_of_article:
+                bottom_correction = bottom_of_article.getText()
+                print_info_index = bottom_correction.find('A version of this article appears in print on')
+                if print_info_index > -1:
+                    bottom_correction = bottom_correction[:print_info_index]
+        if not bottom_correction:
+            bottom_correction = '\n'
+
+        self.body = '\n'.join([
+            top_correction,
+            main_body,
+            authorid,
+            bottom_correction,
+        ])
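The new fallback chain hinges on bs4 matching a compiled regex against an element's class attribute, which is how the StoryBodyCompanionColumn wrapper divs are found. A minimal standalone sketch of that technique; the markup and the css-example class name are invented for illustration:

    import re

    from bs4 import BeautifulSoup

    # Invented markup in the style of the new NYT article format.
    html = '''
    <div class="css-example StoryBodyCompanionColumn">
      <p>First paragraph.</p>
      <h2>A subheading</h2>
    </div>
    '''

    # Same pattern as the parser above: bs4 tries the regex against each
    # class token, so the wrapper matches regardless of neighboring classes.
    paragraph_wrapper_re = re.compile(r'.*\bStoryBodyCompanionColumn\b.*')

    soup = BeautifulSoup(html, 'html.parser')
    wrappers = soup.findAll('div', attrs={'class': paragraph_wrapper_re})

    # Flatten the per-wrapper tag lists, as the parser does with sum(..., []).
    p_tags = sum([div.findAll(['p', 'h2']) for div in wrappers], [])
    print([tag.getText() for tag in p_tags])  # ['First paragraph.', 'A subheading']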
28 changes: 28 additions & 0 deletions parsers/test_nyt.py
@@ -0,0 +1,28 @@
+from mock import patch
+
+from nyt import NYTParser
+
+
+@patch('parsers.baseparser.grab_url')
+def test_old_format(mock_grab_url):
+    from test_nyt_data import HTML_OLD_FORMAT
+    mock_grab_url.return_value = HTML_OLD_FORMAT
+    parser = NYTParser('https://www.nytimes.com/2018/05/18/us/school-shooting-santa-fe-texas.html')
+    assert 'SANTA FE, Tex.' in parser.body
+
+
+@patch('parsers.baseparser.grab_url')
+def test_new_format(mock_grab_url):
+    from test_nyt_data import HTML_NEW_FORMAT
+    mock_grab_url.return_value = HTML_NEW_FORMAT
+    parser = NYTParser('https://www.nytimes.com/2018/05/16/us/politics/mueller-trump-indictment.html')
+    assert 'Trump' in parser.body
+    assert len(parser.body.split('\n\n')) == 28
+    assert not parser.title.endswith('- The New York Times')
+
+@patch('parsers.baseparser.grab_url')
+def test_corrections(mock_grab_url):
+    from test_nyt_data import HTML_WITH_CORRECTION
+    mock_grab_url.return_value = HTML_WITH_CORRECTION
+    parser = NYTParser('https://www.nytimes.com/2018/06/05/world/europe/greece-macedonia.html')
+    assert 'Correction:' in parser.body
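The tests patch grab_url at the point where BaseParser looks it up, so constructing an NYTParser never touches the network. A sketch of the same pattern outside pytest, reusing the fixture and URL from test_new_format above, assuming the interpreter is run where the bare nyt and test_nyt_data imports resolve and the parsers package is importable:

    from mock import patch

    from nyt import NYTParser
    from test_nyt_data import HTML_NEW_FORMAT

    # Patch grab_url where BaseParser resolves it; the constructor then
    # parses the canned fixture instead of fetching the live page.
    with patch('parsers.baseparser.grab_url', return_value=HTML_NEW_FORMAT):
        parser = NYTParser('https://www.nytimes.com/2018/05/16/us/politics/mueller-trump-indictment.html')

    print(parser.title)
    print(parser.body[:200])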