Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Textminer normal mode #5

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,6 @@ docs/_build/

# PyBuilder
target/

.direnv/
.idea/
.envrc
61 changes: 61 additions & 0 deletions textminer/separator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re

def words(txt):
    """Extract every word-like token from ``txt``.

    A token is an optional run of digits followed by one or more letters
    or hyphens, starting at a word boundary (e.g. ``18-wheeler``).

    Args:
        txt: Text to scan.

    Returns:
        A list of the tokens found, in order of appearance, or ``None``
        when the text contains no such token.
    """
    found = re.findall(r"\b[\d]*[A-Za-z-]+", txt)
    # An empty list is falsy, so callers get None rather than [].
    return found or None

def phone_number(num):
    """Split a US phone number into its area code and local number.

    Args:
        num: Text beginning with a phone number such as ``919-555-1212``,
            ``(919) 555-1212``, ``919.555.1212`` or ``9195551212``.

    Returns:
        ``{"area_code": "919", "number": "555-1212"}`` style dict, or
        ``None`` when ``num`` does not start with a phone number.
    """
    found = re.match(r"\(?(?P<area_code>\d{3})[\.\)]?[.\- ]?(?P<first>\d{3})[.\-]?(?P<last>\d{4})", num)
    if found is None:
        return None
    # The local number is always normalised to the dashed form.
    return {
        "area_code": found.group("area_code"),
        "number": found.group("first") + "-" + found.group("last"),
    }

def zipcode(num):
    """Split a US ZIP code into its 5-digit part and optional +4 part.

    Args:
        num: String holding either ``NNNNN`` or ``NNNNN-NNNN``.

    Returns:
        ``{"zip": ..., "plus4": ...}`` with ``plus4`` set to ``None`` for
        a plain 5-digit code, or ``None`` when ``num`` is not a ZIP code.
    """
    # Both forms must consume the whole string. Previously only the
    # 5-digit form was anchored, so "12345-67890" was accepted with a
    # truncated plus4 of "6789".
    match = re.match(
        r"""(?P<zip>\d{5})            # five digit zipcode
            (?:-(?P<plus4>\d{4}))?    # optional dash and four more digits
            \Z                        # has to end, nothing else allowed
        """,
        num,
        re.VERBOSE,
    )
    if match is None:
        return None
    return match.groupdict()


def dates(num):
    """Parse a date written as ``YYYY-MM-DD`` or ``MM/DD/YYYY``.

    Either ``/`` or ``-`` may be used as the separator in both layouts.

    Args:
        num: Text beginning with a date.

    Returns:
        A dict with integer ``month``, ``day`` and ``year`` entries, or
        ``None`` when ``num`` does not start with a date in one of the
        two supported layouts.
    """
    match = re.match(
        r"""\A(?:
              (?P<year>\d{4})       # YYYY
              [/-]
              (?P<month>\d{1,2})    # MM
              [/-]
              (?P<day>\d{2})        # DD
            |                       # or
              (?P<month2>\d{1,2})   # MM
              [/-]
              (?P<day2>\d{1,2})     # DD
              [/-]
              (?P<year2>\d{4})      # YYYY
            )
            (?!\d)                  # last field must not run into more digits
        """,
        num,
        re.VERBOSE,
    )
    if match is None:
        return None

    fields = match.groupdict()
    # Exactly one alternative matched; the month-first one stores its
    # values under the "...2" names. Building a fresh dict guarantees the
    # helper keys never leak into the result.
    suffix = "2" if fields["month2"] is not None else ""
    return {key: int(fields[key + suffix]) for key in ("month", "day", "year")}

# Expected results for dates(), copied from the test suite:
#   ("9/4/1976", {"month": 9, "day": 4, "year": 1976}),
#   ("1976-09-04", {"month": 9, "day": 4, "year": 1976}),
#   ("2015-01-01", {"month": 1, "day": 1, "year": 2015}),
#   ("02/15/2004", {"month": 2, "day": 15, "year": 2004}),
#   ("9/4", None),
#   ("2015", None),
9 changes: 3 additions & 6 deletions textminer/tests/test_separator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import textminer.separator as s

@xfail
@params("input,expected", [
("hello", ['hello']),
("hello world", ['hello', 'world']),
Expand All @@ -16,7 +15,6 @@ def test_words(input, expected):
assert s.words(input) == expected


@xfail
@params("input,expected", [
("919-555-1212", {"area_code": "919", "number": "555-1212"}),
("(919) 555-1212", {"area_code": "919", "number": "555-1212"}),
Expand Down Expand Up @@ -57,7 +55,6 @@ def test_money(input, expected):
assert s.money(input) == expected


@xfail
@params("input,expected", [
("63936", {"zip": "63936", "plus4": None}),
("50583", {"zip": "50583", "plus4": None}),
Expand All @@ -73,7 +70,7 @@ def test_zip(input, expected):
assert s.zipcode(input) == expected


@xfail

@params("input,expected", [
("9/4/1976", {"month": 9, "day": 4, "year": 1976}),
("1976-09-04", {"month": 9, "day": 4, "year": 1976}),
Expand All @@ -82,8 +79,8 @@ def test_zip(input, expected):
("9/4", None),
("2015", None),
])
def test_date(input, expected):
assert s.date(input) == expected
def test_dates(input, expected):
assert s.dates(input) == expected


## HARD MODE BEGINS
Expand Down
9 changes: 0 additions & 9 deletions textminer/tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import textminer.validator as v


@xfail
def test_binary_numbers():
assert v.binary("0")
assert v.binary("1")
Expand All @@ -15,7 +14,6 @@ def test_binary_numbers():
assert not v.binary("911")


@xfail
def test_binary_even():
"""String must be a binary number and be even."""

Expand All @@ -24,7 +22,6 @@ def test_binary_even():
assert not v.binary_even("1011")


@xfail
def test_hexadecimal():
assert v.hex("CAFE")
assert v.hex("9F9")
Expand All @@ -34,7 +31,6 @@ def test_hexadecimal():
assert not v.hex("COFFEE")


@xfail
def test_word():
assert v.word("hello")
assert v.word("wonderful")
Expand All @@ -47,7 +43,6 @@ def test_word():
assert not v.word("bar*us")


@xfail
def test_words():
"""words can take an optional count argument. In case it exists, the text
must match that number of words."""
Expand All @@ -69,7 +64,6 @@ def test_words():
assert not v.words("18-wheeler tarbox", count=3)


@xfail
def test_phone_numbers():
"""US phone numbers only."""

Expand All @@ -84,7 +78,6 @@ def test_phone_numbers():
assert not v.phone_number("mobile")


@xfail
def test_money():
"""We are just concerned with dollars here."""

Expand All @@ -110,7 +103,6 @@ def test_money():
assert not v.money("$$31")


@xfail
def test_zip():
assert v.zipcode("63936")
assert v.zipcode("50583")
Expand All @@ -125,7 +117,6 @@ def test_zip():
assert not v.zipcode("8029-3924")


@xfail
def test_date():
assert v.date("9/4/1976")
assert v.date("1976-09-04")
Expand Down
63 changes: 63 additions & 0 deletions textminer/validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import re

def binary(num):
    """Check whether ``num`` is a binary number.

    Args:
        num: String to validate.

    Returns:
        A match object (truthy) when ``num`` consists solely of one or
        more ``0``/``1`` characters, otherwise ``None``.
    """
    # re.match only anchors the start; without \Z a string such as "102"
    # would validate because its prefix "10" matches.
    return re.match(r"[01]+\Z", num)

def binary_even(num):
    """Check whether ``num`` is a binary number with an even value.

    A binary number is even exactly when its last digit is ``0``.

    Args:
        num: String to validate.

    Returns:
        A match object (truthy) when ``num`` is made only of ``0``/``1``
        characters and ends in ``0``, otherwise ``None``.
    """
    # "[01]*" (not "+") so the single digit "0" counts as even; \Z rather
    # than "$" so a trailing newline is not silently accepted.
    return re.match(r"[01]*0\Z", num)

def hex(txt):
    """Check whether ``txt`` is a hexadecimal number.

    Note: the name shadows the ``hex`` builtin inside this module; it is
    kept because callers use it as ``validator.hex``.

    Args:
        txt: String to validate.

    Returns:
        A match object (truthy) when ``txt`` is one or more hexadecimal
        digits (``0-9``, ``A-F``, ``a-f``) and nothing else, otherwise
        ``None``.
    """
    # The stray test assertions that followed the return statement were
    # unreachable and referenced an undefined name, so they were removed.
    return re.match(r"\A[0-9A-Fa-f]+\Z", txt)

def word(txt):
    """Check whether ``txt`` is a single word.

    A word is an optional run of leading digits followed by one or more
    letters or hyphens, e.g. ``hello`` or ``18-wheeler``.

    Args:
        txt: String to validate.

    Returns:
        A match object (truthy) when the whole of ``txt`` is one word,
        otherwise ``None``.
    """
    word_pattern = re.compile(r"\A[\d]*[A-Za-z-]+\Z")
    return word_pattern.match(txt)

def words(txt, count=0):
    """Check whether ``txt`` is a whitespace-separated sequence of words.

    Each word is an optional run of digits followed by letters or
    hyphens (the same rule a single word is validated against).

    Args:
        txt: String to validate.
        count: When non-zero, ``txt`` must contain exactly this many
            whitespace-separated tokens. ``0`` (the default) disables
            the check.

    Returns:
        A match object (truthy) when ``txt`` is valid; ``False`` when the
        token count differs from ``count``; ``None`` when some token is
        not a word or ``txt`` contains no word at all.
    """
    if count != 0 and len(txt.split()) != count:
        return False

    # Validate word by word. Two fixes over a single character class that
    # also allows whitespace: whitespace-only text no longer passes, and
    # a digit prefix is allowed on every word, not just the first one.
    return re.match(
        r"\A\s*\d*[A-Za-z-]+(?:\s+\d*[A-Za-z-]+)*\s*\Z",
        txt,
    )


def phone_number(num):
    """Check whether ``num`` is a US phone number.

    Accepted layouts include ``919-555-1212``, ``(919) 555-1212``,
    ``919.555.1212``, ``919 555-1212`` and ``9195551212``.

    Args:
        num: String to validate.

    Returns:
        A match object (truthy) when the whole of ``num`` is a phone
        number, otherwise ``None``.
    """
    # \Z makes this a whole-string check; without it any text starting
    # with a phone number (e.g. one with extra trailing digits) validated.
    return re.match(r"\(?\d{3}[\.\)]?[.\- ]?\d{3}[.\-]?\d{4}\Z", num)


def money(num):
    """Check whether ``num`` is a US dollar amount.

    We are just concerned with dollars here. Valid amounts start with a
    single ``$``, followed by digits that are either ungrouped
    (``$1000``) or grouped in threes with commas (``$1,000``), with an
    optional two-digit cents part (``$1,000.00``).

    Args:
        num: String to validate.

    Returns:
        A match object (truthy) when the whole of ``num`` is a dollar
        amount, otherwise ``None``.
    """
    return re.match(
        r"""\$                  # leading dollar sign
            ((\d{1,3}           # one to three digits
              (,\d{3})*)        # then any number of comma + three digit groups
             |                  # or
             \d+)               # one or more digits without commas
            (\.\d{2})?          # optional cents: a dot and exactly two digits
            \Z                  # end of string; unlike "$", rejects a trailing newline
        """,
        num,
        re.VERBOSE,
    )

def zipcode(num):
    """Check whether ``num`` is a US ZIP code.

    Args:
        num: String to validate; either ``NNNNN`` or ``NNNNN-NNNN``.

    Returns:
        A match object (truthy) when the whole of ``num`` is a ZIP code,
        otherwise ``None``.
    """
    # The end anchor applies to both forms. Previously only the 5-digit
    # form was anchored, so "12345-67890" passed as a ZIP+4 code.
    return re.match(
        r"""\d{5}        # five digit zipcode
            (-\d{4})?    # optional dash and four more digits
            \Z           # has to end, nothing else allowed
        """,
        num,
        re.VERBOSE,
    )


def date(num):
    """Check whether ``num`` looks like a date.

    Three numeric fields separated by ``/`` or ``-`` are required: the
    first has 1, 2 or 4 digits, the second 1 or 2 digits, and the last
    2 or 4 digits (e.g. ``9/4/1976``, ``1976-09-04``). Only the shape is
    checked, not whether the values form a real calendar date.

    Args:
        num: String to validate.

    Returns:
        A match object (truthy) when the whole of ``num`` has that shape,
        otherwise ``None``.
    """
    return re.match(
        r"""\A(\d{1,2}|\d{4})   # first field: 1, 2 or 4 digits
            [/-]
            \d{1,2}             # middle field: 1 or 2 digits
            [/-]
            (\d{2}|\d{4})       # last field: 2 or 4 digits (was a literal "d{4}")
            \Z                  # nothing may follow the last field
        """,
        num,
        re.VERBOSE,
    )