From cfacada8537c42663fe73fd3b7768cc89a2e64a1 Mon Sep 17 00:00:00 2001
From: mxamin <amin.solhizadeh@gmail.com>
Date: Mon, 5 Sep 2016 14:50:25 +0430
Subject: [PATCH 1/2] Add Farsi (Persian) Language Support

---
 README.rst                            |  17 ++
 goose/resources/text/stopwords-fa.txt | 218 ++++++++++++++++++++++++++
 goose/text.py                         |  23 +++
 requirements.txt                      |   1 +
 4 files changed, 259 insertions(+)
 create mode 100755 goose/resources/text/stopwords-fa.txt

diff --git a/README.rst b/README.rst
index 5dc8ab0b..07bccd14 100644
--- a/README.rst
+++ b/README.rst
@@ -178,6 +178,23 @@ class.
     >>> print article.cleaned_text[:150]
     دمشق، سوريا (CNN) -- أكدت جهات سورية معارضة أن فصائل مسلحة معارضة لنظام الرئيس بشار الأسد وعلى صلة بـ"الجيش الحر" تمكنت من السيطرة على مستودعات للأسل
 
+Goose in Farsi (Persian)
+------------------------
+
+In order to use Goose in Farsi you have to use the StopWordsFarsi
+class.
+
+::
+
+    >>> from goose import Goose
+    >>> from goose.text import StopWordsFarsi
+    >>> url = 'http://www.tabnak.ir/fa/news/620497/'
+    >>> g = Goose({'stopwords_class': StopWordsFarsi})
+    >>> article = g.extract(url=url)
+    >>> print article.cleaned_text[:150]
+    در مراسم تجلیل از مدال آوران المپیک عکس های جالبی به بیرون مخابره شد.
+    به گزارش تابناک ورزشی، روز گذشته مراسم تجلیل از مدال آوران المپیک توسط علی
+
 
 Goose in Korean
 ----------------
diff --git a/goose/resources/text/stopwords-fa.txt b/goose/resources/text/stopwords-fa.txt
new file mode 100755
index 00000000..99182648
--- /dev/null
+++ b/goose/resources/text/stopwords-fa.txt
@@ -0,0 +1,218 @@
+گیرید
+کردی
+خواستن
+بیاب
+بخواهد
+آورد
+گیریم
+کردید
+خواستند
+بیابد
+بخواهم
+آوردم
+مي‌شود
+کردیم
+خواسته
+بیابم
+بخواهند
+آوردن
+هست
+کن
+خواستی
+بیابند
+بخواهی
+آوردند
+هستم
+کند
+خواستید
+بیابی
+بخواهید
+آورده
+هستند
+کنم
+خواستیم
+بیابید
+بخواهیم
+آوردی
+هستی
+کنند
+خواهد
+بیابیم
+بکن
+آوردید
+هستید
+کنی
+خواهم
+بیاور
+بکند
+آوردیم
+هستیم
+کنید
+خواهند
+بیاورد
+بکنم
+آورم
+یابد
+کنیم
+خواهی
+بیاورم
+بکنند
+آورند
+یابم
+گرفت
+خواهید
+بیاورند
+بکنی
+آوری
+یابند
+گرفتم
+خواهیم
+بیاوری
+بکنید
+آورید
+یابی
+گرفتن
+داد
+بیاورید
+بکنیم
+آوریم
+یابید
+گرفتند
+دار
+بیاوریم
+بگو
+آید
+یابیم
+گرفته
+دارد
+بیاید
+بگوید
+آیم
+یافت
+گرفتی
+دارم
+بیایم
+بگویم
+آیند
+یافتم
+گرفتید
+دارند
+بیایند
+بگویند
+آیی
+یافتن
+گرفتیم
+داری
+بیایی
+بگویی
+آیید
+یافتند
+گفت
+دارید
+بیایید
+بگویید
+آییم
+یافته
+گفتم
+داریم
+بیاییم
+بگوییم
+باش
+یافتی
+گفتن
+داشت
+تواند
+بگیر
+باشد
+یافتید
+گفتند
+داشتم
+توانست
+بگیرد
+باشد
+یافتیم
+گفته
+داشتن
+توانستم
+بگیرم
+باشم
+گفتی
+داشتند
+توانستن
+بگیرند
+باشند
+گفتید
+داشته
+توانستند
+بگیری
+باشی
+گفتیم
+داشتی
+توانسته
+بگیرید
+باشید
+گوید
+داشتید
+توانستی
+بگیریم
+باشیم
+و
+در
+به
+از
+كه
+اين
+با
+می
+را
+های
+براي
+ها
+آن
+وي
+يك
+خود
+بر
+ای
+نيز
+تا
+ما
+بايد
+اند
+هم
+بود
+نمی
+هر
+یا
+دو
+آن‌ها
+اما
+ديگر
+اگر
+همچنین
+است
+آیا
+او
+براى
+ندارد
+که
+می
+های
+شود
+ای
+ولی
+شده
+کرده
+کنم
+ام
+نمی
+یا
+بودن
+کنند
+من
+شما
+بنده
+اینکه
+بعد
+خواهد شد
+دادم
\ No newline at end of file
diff --git a/goose/text.py b/goose/text.py
index 3ef63d6b..0365f6e8 100644
--- a/goose/text.py
+++ b/goose/text.py
@@ -168,6 +168,29 @@ def candiate_words(self, stripped_input):
         return words
 
 
+class StopWordsFarsi(StopWords):
+    """
+    Farsi (Persian) segmentation, delegated to the hazm library
+    """
+    def __init__(self, language='fa'):
+        # the language argument is ignored: 'fa' is always used
+        super(StopWordsFarsi, self).__init__(language='fa')
+
+    def remove_punctuation(self, content):
+        # content is handed back untouched
+        return content
+
+    def candiate_words(self, stripped_input):
+        # imported at call time, not when this module is loaded
+        import hazm
+        norm = hazm.Normalizer()
+        stem_tool = hazm.Stemmer()
+
+        # normalize the text, split it into tokens, stem each token
+        tokens = hazm.word_tokenize(norm.normalize(stripped_input))
+        return [stem_tool.stem(token) for token in tokens]
+
+
 class StopWordsKorean(StopWords):
     """
     Korean segmentation
diff --git a/requirements.txt b/requirements.txt
index 7e6a6c09..72e06246 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ cssselect
 jieba
 beautifulsoup
 nltk
+hazm
\ No newline at end of file

From 3ba1af2aab8a2bd68737de8629e1327cb5b3fa60 Mon Sep 17 00:00:00 2001
From: mxamin <amin.solhizadeh@gmail.com>
Date: Tue, 6 Sep 2016 15:25:06 +0430
Subject: [PATCH 2/2] Bugfix: Fix Some Incompatibility With Unicode URL

- Python's hashing function sometimes falls into a recursion loop (I
  don't know why) when unencoded text is passed to it.

- Before passing an encoded URL to `urllib2.urlopen`, the URL MUST
  be quoted; otherwise we receive an HTTP 400 (bad request) response
  from the web server. It would be better to use the `requests` module,
  which handles these situations internally, instead of `urllib2`.
---
 goose/network.py        | 14 ++++++++++++++
 goose/utils/__init__.py |  2 +-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/goose/network.py b/goose/network.py
index 666a7d61..9712b148 100644
--- a/goose/network.py
+++ b/goose/network.py
@@ -20,7 +20,9 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+import urllib
 import urllib2
+import urlparse
 
 
 class HtmlFetcher(object):
@@ -42,6 +44,18 @@ def get_html(self, url):
         if isinstance(url, unicode):
             url = url.encode('utf-8')
 
+        # Percent-encode raw non-ASCII bytes in the path, query and fragment
+        # (servers reject them with HTTP 400). URL delimiters and '%' are
+        # kept safe so queries stay intact and escapes aren't encoded twice.
+        res = urlparse.urlsplit(url)
+        url = urlparse.SplitResult(
+            scheme=res.scheme,
+            netloc=res.netloc,
+            path=urllib.quote(res.path, safe="/%:@!$&'()*+,;=~"),
+            query=urllib.quote(res.query, safe="/?%:@!$&'()*+,;=~"),
+            fragment=urllib.quote(res.fragment, safe="/?%:@!$&'()*+,;=~")
+        ).geturl()
+
         # set request
         self.request = urllib2.Request(
                         url,
diff --git a/goose/utils/__init__.py b/goose/utils/__init__.py
index 5a1de7d4..ba69ed07 100644
--- a/goose/utils/__init__.py
+++ b/goose/utils/__init__.py
@@ -101,7 +101,7 @@ def get_parsing_candidate(self, url_to_crawl):
         # replace shebang is urls
         final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
                     if '#!' in url_to_crawl else url_to_crawl
-        link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
+        link_hash = '%s.%s' % (hashlib.md5(final_url.encode('utf-8') if isinstance(final_url, unicode) else final_url).hexdigest(), time.time())
         return ParsingCandidate(final_url, link_hash)