diff --git a/VERSIONS b/VERSIONS
index 268f16fe..109b3203 100644
--- a/VERSIONS
+++ b/VERSIONS
@@ -1,3 +1,9 @@
+Changes in VERSION 0.1.02
+    Added ability to get "main image"
+        - Document class now has main_image_dict attribute which is populated
+          by the summary() method
+        - Added a requirements.txt
+
 Changes in VERSION 0.1.01
     Began customizing Readability
         -Added several divs to unlikelyCandidatesRe
diff --git a/readability/readability.py b/readability/readability.py
index 41a023c5..4bd05484 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -2,6 +2,10 @@
 import logging
 import re
 import sys
+import requests
+
+from PIL import Image
+from StringIO import StringIO
 
 from collections import defaultdict
 from lxml.etree import tostring
@@ -99,6 +103,65 @@ def __init__(self, input, **options):
         self.input = input
         self.options = options
         self.html = None
+        self.main_image_dict = {}
+
+    def _get_main_image_dict(self, html_string):
+        """Return a dict describing the largest image in html_string.
+
+        Every img tag that has a src is downloaded and opened with PIL.
+        Images that cannot be fetched or decoded, or that cover less
+        than 100 * 100 pixels, are ignored.
+
+        Returns {} when no image qualifies, otherwise a dict with the
+        keys 'url', 'size', 'pix-area' and 'object' (the PIL image).
+        """
+        min_pix_area = 10000  # 100 * 100
+        timeout = 10  # seconds, so a dead image host cannot hang us
+
+        # Transform the html string into an lxml tree
+        doc = build_doc(html_string)
+
+        images = []
+        for tag in self.tags(doc, 'img'):
+            # Not every img tag carries a src attribute
+            url = tag.attrib.get('src')
+            if not url:
+                continue
+
+            # A failed download only costs us this image; it must not
+            # bubble up into summary() and fail the whole extraction
+            try:
+                r = requests.get(url, timeout=timeout)
+            except requests.exceptions.RequestException:
+                continue
+            if r.status_code != 200:
+                continue
+
+            try:
+                image_data = Image.open(StringIO(r.content))
+            except IOError:
+                continue
+
+            width, height = image_data.size
+            pix_area = width * height
+
+            # Filter out images that are not big enough
+            if pix_area < min_pix_area:
+                continue
+
+            images.append({
+                'url': url,
+                'size': image_data.size,
+                'pix-area': pix_area,
+                'object': image_data,
+            })
+
+        # If we have no images we return an empty dict
+        if not images:
+            return {}
+
+        # Return the largest image; on a tie the first one found wins
+        return max(images, key=lambda image: image['pix-area'])
 
     def _html(self, force=False):
         if force or self.html is None:
@@ -176,6 +239,8 @@ def summary(self, html_partial=False):
                     # Loop through and try again.
                     continue
                 else:
+                    # Try to get the main image
+                    self.main_image_dict = self._get_main_image_dict(cleaned_article)
                     return cleaned_article
         except StandardError, e:
             log.exception('error getting summary: ')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..c1b60c29
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+lxml==2.3.3
+PIL==1.1.7
+requests==0.13.3
diff --git a/setup.py b/setup.py
index 59851754..7c8a32ed 100755
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name="readability-lxml",
-    version="0.1.01", # Akimbo Specific Version
+    version="0.1.02", # Akimbo Specific Version
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast python port of arc90's readability tool",