From 849a05bfcd3be7a078a72a5ecaa8475429ae66d1 Mon Sep 17 00:00:00 2001
From: Jack Gibson
Date: Sat, 25 May 2024 20:06:46 -0500
Subject: [PATCH] create dictionary of titles

---
 civiclens/nlp/pipeline.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/civiclens/nlp/pipeline.py b/civiclens/nlp/pipeline.py
index dfbffbf5..ab966657 100644
--- a/civiclens/nlp/pipeline.py
+++ b/civiclens/nlp/pipeline.py
@@ -66,15 +66,15 @@ def get_last_update():
 
 def docs_have_titles():
     """Gets all docs that have nlp titles already"""
-    titles_query = """SELECT document_id
+    titles_query = """SELECT document_id, doc_plain_english_title
     FROM regulations_nlpoutput
     WHERE doc_plain_english_title IS NOT NULL"""
     db_title = Database()
     docs_with_titles = pull_data(
-        query=titles_query, connection=db_title, schema=["document_id"]
+        query=titles_query, connection=db_title, return_type="list"
     )
-    docs_with_titles = docs_with_titles["document_id"].to_list()
-    return docs_with_titles
+
+    return dict(docs_with_titles)
 
 
 if __name__ == "__main__":
@@ -89,7 +89,7 @@ def docs_have_titles():
     else:
         args = parser.parse_args()
         last_updated = get_last_update()
-        docs_with_titles = docs_have_titles()
+        doc_titles = docs_have_titles()
         # what docs need comment nlp update
         if last_updated is not None:
             docs_to_update = f"""SELECT document_id
@@ -101,7 +101,7 @@ def docs_have_titles():
                             SELECT COUNT(*)
                             FROM regulations_comment rc2
                             WHERE rc2.document_id = rc1.document_id
-                        );"""  # noqa: E702, E231, E241
+                        );"""  # noqa: E702, E231, E241, E202
         else:
             docs_to_update = """SELECT document_id
             FROM regulations_comment rc1
@@ -125,11 +125,13 @@ def docs_have_titles():
             comment_data = RepComments(document_id=doc_id)
             comment_data.summary = titles.get_doc_summary(id=doc_id)[0, "summary"]
 
-            if (doc_id not in docs_with_titles and comment_data.summary) or (
+            if (doc_id not in doc_titles and comment_data.summary) or (
                 args.refresh and comment_data.summary
             ):
                 new_title = title_creator.invoke(paragraph=comment_data.summary)
                 comment_data.doc_plain_english_title = new_title
+            else:
+                comment_data.doc_comments = doc_titles[doc_id]
 
             # do rep comment nlp
             comment_df = get_doc_comments(doc_id)