From 58c1743ed3d5b04e025e9b52de38b472712b063e Mon Sep 17 00:00:00 2001 From: Jeremy Gresham Date: Wed, 27 May 2015 21:47:58 -0400 Subject: [PATCH] done --- textminer/extractor.py | 9 +++ textminer/separator.py | 140 +++++++++++++++++++++++++++++++++++++++++ textminer/validator.py | 78 +++++++++++++++++++++++ 3 files changed, 227 insertions(+) diff --git a/textminer/extractor.py b/textminer/extractor.py index e69de29..778921a 100644 --- a/textminer/extractor.py +++ b/textminer/extractor.py @@ -0,0 +1,9 @@ +import re + +phones = ['(\([0-9]{3}\))?[\s-]?([0-9]{3})[\s-]?([0-9])'] + +def phone_numbers(text): + return re.findall(r'\([0-9]{3}\)\s[0-9]{3}-[0-9]{4}', text) + +def emails(text): + return re.findall(r'[-_A-Za-z\.]+@[-A-Za-z\.]+', text) diff --git a/textminer/separator.py b/textminer/separator.py index e69de29..3c5ebae 100644 --- a/textminer/separator.py +++ b/textminer/separator.py @@ -0,0 +1,140 @@ +import re +import textminer.validator as v + +months = { + 'January':1, + 'Jan':1, + 'February':2, + 'Feb':2, + 'March':3, + 'April':4, + 'May':5, + 'June':6, + 'July':7, + 'August':8, + 'Aug':8, + 'September':9, + 'Sept':9, + 'October':10, + 'Oct':10, + 'November':11, + 'Nov':11, + 'December':12, + 'Dec':12 +} +days_month= { + 1:31, + 2:28, + 3:31, + 4:30, + 5:31, + 6:30, + 7:31, + 8:31, + 9:30, + 10:31, + 11:30, + 12:31 +} + +def words(text): + wrds = re.findall(r'\b\w*[A-Za-z-_]+\w*\b', text) + if wrds: + return wrds + return None + + +def phone_number(text): + result = re.match(r'\(?([0-9]{3})\)?[\s\.-]?([0-9]{3})[\.-]?([0-9]{4})', text) + if result: + result = result.groups() + return {'area_code': result[0], 'number': result[1] + '-' + result[2]} + return None + + +def money(text): + + result = re.match(r""" + ^(\${1})(\d{1,3},)((?:\d{3},)*)(\d{3})(\.\d{2})?$| + ^(\${1})([0-9]+)(\.\d{2})?$ + """, text, flags=re.X) + + if result: + result = "".join(item for item in result.groups() if item) + result = re.sub(",", "", result) + + return {'amount': float(result[1:]), 'currency': result[0]} + return None + + +def zipcode(text): + result = re.match(r'([0-9]{5})-([0-9]+)|([0-9]{5})\b', text) + + if result: + result = result.groups() + + if result[0]: + if result[1] and len(result[1]) == 4: + return {'zip': result[0], 'plus4': result[1]} + + return None + else: + return {'zip': result[2], 'plus4': None} + + return None + + +def date(text): + regs = ['^(?P[0-9]{1,2})[/-](?P[0-9]{1,2})[/-](?P[0-9]{4})$', + '^(?P[0-9]{4})[/-](?P[0-9]{1,2})[/-](?P[0-9]{1,2})$', + '^(?P[0-9]{4})\s+(?P[A-Za-z]+)\.?\s+(?P[0-9]{2})$', + '^(?P[A-Za-z]+)\.?\s+(?P[0-9]{1,2}),\s+(?P[0-9]{4})$', + ] + + result = [re.match(r'{}'.format(item), text) for item in regs] + result = [item for item in result if item] + + if result: + result = result[0].groupdict() + + if result['month'] in months.keys(): + result['month'] = months[result['month']] + + result = {item: int(value) for item, value in result.items() if value} + + if result['day'] > days_month[result['month']]: + return None + + return result + + return None + + +def email(text): + result = re.match(r'([A-Za-z0-9]+\.?[A-Za-z0-9]+)@([\w]+\.[A-Za-z]+)', text) + + if result: + result = result.groups() + return {'local': result[0], 'domain': result[1]} + return None + + +def address(text): + result = re.match(r"""(?P
[0-9]+(?:\s\w+\.?)*)[,\s]+ + (?P\w+\s?(?:\w+)?)[,\s]+(?P[A-Z]{2})[,\s]+ + (?P[0-9]+-?(?:[0-9]+)?)""" , text, re.X) + + if result: + + result = result.groupdict() + + if not [item for item, value in result.items() if value == None]: + if v.zipcode(result['zip']): + if v.words(result['city']): + zip_parts = zipcode(result['zip']) + for item, value in zip_parts.items(): + result[item] = value + + return result + + return None diff --git a/textminer/validator.py b/textminer/validator.py index e69de29..0cbc54d 100644 --- a/textminer/validator.py +++ b/textminer/validator.py @@ -0,0 +1,78 @@ +import re + + +def binary(text): + if re.match(r'^[01]+$', text): + return True + return False + + +def binary_even(text): + return int(text[-1]) == 0 + +def hex(text): + if re.match(r'^[0-9A-F]+$', text): + return True + return False + +def word(text): + return re.match(r'.+[A-Za-z]+.+', text) and not re.match(r'.+[^A-Za-z0-9-].+', text) + +def words(text, count=0): + + if text: + + text = re.findall(r'\b\w*[A-Za-z-_]+\w*\b', text) + + if not text: + return False + + notwords = [item for item in text if not word(item)] + + if notwords: + return False + + if (count != 0 and count == len(text)) or count == 0: + return True + + return False + + +def phone_number(text): + result = re.match(r'\(?([0-9]{3})\)?[\s\.-]?([0-9]{3})[\.-]?([0-9]{4})', text) + if result: + return True + return False + +def money(text): + result = re.match(r""" + ^(\${1})(\d{1,3},)((?:\d{3},)*)(\d{3})(\.\d{2})?$| + ^(\${1})([0-9]+)(\.\d{2})?$ + """, text, flags=re.X) + + if result: + return True + return False + +def zipcode(text): + result = re.match(r'([0-9]{5})-([0-9]{4})$|([0-9]{5})$', text) + + if result: + return True + return False + +def email(text): + result = re.findall(r'([A-Za-z0-9]+\.?[A-Za-z0-9]+)@([\w]+\.[A-Za-z]+)', text) + + if result: + return True + return False + +def address(text): + result = re.match(r"""(?P
[0-9]+(?:\s\w+\.?)*)[,\s]+ + (?P\w+\s?(?:\w+)?)[,\s]+(?P[A-Z]{2})[,\s]+ + (?P[0-9]+-?(?:[0-9]+)?)""" , text, re.X) + + if result: + return True + return False