From 19ea3ca641ff4b8be54c16dc565361ef32a0e9f7 Mon Sep 17 00:00:00 2001 From: abhijitsinha Date: Sun, 8 Oct 2017 20:46:28 +0530 Subject: [PATCH] master: minor code arrangement. don't like using global variables so much so changed them to pure functions. also moved all the test sentences to another file. created a separate directory for data files. --- .gitignore | 4 +++ data/sample_sentences.txt | 34 ++++++++++++++++++ date_extractor/date_extractor.py | 61 ++++++++++++++++---------------- date_extractor/date_utils.py | 11 +++--- samples.py | 26 ++++---------- 5 files changed, 80 insertions(+), 56 deletions(-) create mode 100644 data/sample_sentences.txt diff --git a/.gitignore b/.gitignore index 7bbc71c..4ea8339 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,7 @@ ENV/ # mypy .mypy_cache/ + + +#output file +dates.json diff --git a/data/sample_sentences.txt b/data/sample_sentences.txt new file mode 100644 index 0000000..73a329d --- /dev/null +++ b/data/sample_sentences.txt @@ -0,0 +1,34 @@ +it was 25th may, 1990. +it was may 25th, 1990 +it was 25th may +the year was 1990 +in 1990, it happened +it was just may 25 +1st june +2nd june +The main headquarters of NATO is located on Boulevard L\u00e9opold III/Leopold III-laan, B-1110 Brussels, which is in Haren, part of the City of Brussels municipality. +from 1990-2000, its all ok +i am planning to go to my hometown on july 2nd. +born on 2 october 1869. on 12th dec that year, it rained + +Born and raised in the Austrian Empire, Tesla received an advanced education in engineering and physics in the 1870s and gained practical experience in the early 1880s working in telephony and at Continental Edison in the new electric power industry. +He emigrated to the United States in 1884, where he would become a naturalized citizen. +In 1893, he made pronouncements on the possibility of wireless communication with his devices. +Tesla died in New York City in January 1943. +His work fell into relative obscurity following his death, but in 1960, the General Conference on Weights and Measures named the SI unit of magnetic flux density the tesla in his honor. + +For his contributions to the development of quantum electrodynamics, Feynman, jointly with Julian Schwinger and Shin'ichirō Tomonaga, received the Nobel Prize in Physics in 1965. +In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World he was ranked as one of the ten greatest physicists of all time. +He assisted in the development of the atomic bomb during World War II and became known to a wide public in the 1980s as a member of the Rogers Commission, the panel that investigated the Space Shuttle Challenger disaster. +Feynman was a keen popularizer of physics through both books and lectures, including a 1959 talk on top-down nanotechnology called There's Plenty of Room at the Bottom, and the three-volume publication of his undergraduate lectures, The Feynman Lectures on Physics. + +During the final stage of World War II, the United States dropped nuclear weapons on the Japanese cities of Hiroshima and Nagasaki on August 6 and 9, 1945, respectively. +The war in Europe had concluded when Nazi Germany signed its instrument of surrender on May 8, 1945. +The Allies called for the unconditional surrender of the Imperial Japanese armed forces in the Potsdam Declaration on July 26, 1945—the alternative being "prompt and utter destruction". +By August 1945, the Allies' Manhattan Project had produced two types of atomic bomb, and the 509th Composite Group of the United States Army Air Forces (USAAF) was equipped with the specialized Silverplate version of the Boeing B-29 Superfortress that could deliver them from Tinian in the Mariana Islands. +Orders for atomic bombs to be used on four Japanese cities were issued on July 25. +On August 6, the U.S. dropped a uranium gun-type (Little Boy) bomb on Hiroshima, and American President Harry S. Truman called for Japan's surrender, warning it to "expect a rain of ruin from the air, the like of which has never been seen on this earth. +Three days later, on August 9, a plutonium implosion-type (Fat Man) bomb was dropped on Nagasaki. +Japan announced its surrender to the Allies on August 15, six days after the bombing of Nagasaki and the Soviet Union's declaration of war. +On September 2, the Japanese government signed the instrument of surrender, effectively ending World War II. The justification for the bombings of Hiroshima and Nagasaki is still debated to this day. + diff --git a/date_extractor/date_extractor.py b/date_extractor/date_extractor.py index 7cdc48f..1b63c8e 100644 --- a/date_extractor/date_extractor.py +++ b/date_extractor/date_extractor.py @@ -3,32 +3,25 @@ class DateExtractor(): - def __init__(self): - self._words = [] - self._potential_dates = [] - def extract_dates(self, text: str): - - # re-initialize the variables - self._words = [] - self._potential_dates = [] + def extract_dates(self, text): # split the text into words - self._words = re.findall(r"[\w']+", text) + words = re.findall(r"[\w']+", text) # tag the words as potential date/month/year - self._tag_words() + potential_dates = self._tag_words(words) # extract dates_to_return dates_to_return = [] processed_list = [] current_year = None - for i, curr in enumerate(self._potential_dates): + for i, curr in enumerate(potential_dates): if curr in processed_list: continue - n1 = self._get_next_valid_tag(i) - n2 = self._get_next_valid_tag(i + 1) + n1 = self._get_next_valid_tag(i, words, potential_dates) + n2 = self._get_next_valid_tag(i + 1, words, potential_dates) # (d, m, y) if curr['type'] == 'd' and n1 is not None and n1['type'] == 'm' and n2 is not None and n2['type'] == 'y': @@ -107,11 +100,23 @@ def extract_dates(self, text: str): return dates_to_return - def _tag_words(self): - for i, word in enumerate(self._words): - monthNo = DateTimeUtils.isMonth(word) + def _tag_words(self, words): + + potential_dates = []; + + for i, word in enumerate(words): + + if (i > 0): + prev_word = words[i - 1] + else: + prev_word = None + + dayNo = DateTimeUtils.getDate(word) + monthNo = DateTimeUtils.getMonth(word) + yearNo = DateTimeUtils.getYear(word, prev_word) + if monthNo is not None: - self._potential_dates.append({ + potential_dates.append({ 'index': i, 'word': word, 'type': 'm', @@ -119,35 +124,31 @@ def _tag_words(self): }) continue - if (i > 0): - prev_word = self._words[i - 1] - else: - prev_word = True - yearNo = DateTimeUtils.isYear(word, prev_word) if yearNo is not None: - self._potential_dates.append({ + potential_dates.append({ 'index': i, 'word': word, 'type': 'y', 'val': yearNo }) continue - dayNo = DateTimeUtils.isDate(word) + if dayNo is not None: - self._potential_dates.append({ + potential_dates.append({ 'index': i, 'word': word, 'type': 'd', 'val': dayNo }) continue + return potential_dates - def _get_next_valid_tag(self, index): - if index < len(self._potential_dates) - 1: - tag = self._potential_dates[index] + def _get_next_valid_tag(self, index, words, potential_dates): + if index < len(potential_dates) - 1: + tag = potential_dates[index] word_index = tag['index'] - next_word = self._words[word_index + 1] - next_tag = self._potential_dates[index + 1] + next_word = words[word_index + 1] + next_tag = potential_dates[index + 1] if next_tag['index'] == word_index + 1: return next_tag else: diff --git a/date_extractor/date_utils.py b/date_extractor/date_utils.py index 86df7f5..c10f383 100644 --- a/date_extractor/date_utils.py +++ b/date_extractor/date_utils.py @@ -29,7 +29,7 @@ class DateTimeUtils(): allowed_year_prefixes = ['in', 'on', 'year', 'late', 'early'] @staticmethod - def isMonth(word: str): + def getMonth(word: str): word = word.lower() if word in DateTimeUtils.months: return DateTimeUtils.months[word] @@ -37,8 +37,8 @@ def isMonth(word: str): return None @staticmethod - def isYear(word: str, prev_word): - if (prev_word == True + def getYear(word: str, prev_word): + if (prev_word is None or prev_word in DateTimeUtils.allowed_year_prefixes or prev_word.lower() in list(DateTimeUtils.months.keys())): if word.isnumeric() and int(word) < 2018 and int(word) > 1100: @@ -49,9 +49,8 @@ def isYear(word: str, prev_word): return None @staticmethod - def isDate(word): - if word.endswith('th') or word.endswith('st') or word.endswith( - 'nd') or word.endswith('rd'): + def getDate(word): + if word.endswith('st') or word.endswith('nd') or word.endswith('rd') or word.endswith('th'): word = word[:-2] if word.isnumeric() and int(word) < 32 and int(word) > 0: return int(word) diff --git a/samples.py b/samples.py index cead70f..9c8290b 100644 --- a/samples.py +++ b/samples.py @@ -4,30 +4,16 @@ def test_date_extract(): extractor = DateExtractor() - test_sentences = [] - test_sentences.append('it was 25th may, 1990.') - test_sentences.append('it was may 25th, 1990') - test_sentences.append('it was 25th may') - test_sentences.append('the year was 1990') - test_sentences.append('in 1990, it happened') - test_sentences.append('it was just may 25') - test_sentences.append('1st june') - test_sentences.append('2nd june') - test_sentences.append( - 'The main headquarters of NATO is located on Boulevard L\u00e9opold III/Leopold III-laan, B-1110 Brussels, which is in Haren, part of the City of Brussels municipality. ' - ) - test_sentences.append('from 1990-2000, its all ok') - test_sentences.append('i am planning to go to my hometown on july 2nd.') - test_sentences.append( - 'born on 2 october 1869. on 12th dec that year, it rained') + + infile = open('data/sample_sentences.txt', 'r') dates = [] - for sentence in test_sentences: + + for sentence in infile: d = extractor.extract_dates(sentence) dates = dates + d - print('done: ' + sentence + json.dumps(d)) + print('done:', sentence, json.dumps(d)) - with open('dates.json', 'a+') as outfile: + with open('data/dates.json', 'a+') as outfile: json.dump(dates, outfile) - test_date_extract()