NOTE(review): line breaks and indentation of this patch were reconstructed from a
whitespace-collapsed copy; blank context lines are inferred from the hunk line
counts. Run `git apply --check` before applying.

diff --git a/backend-python/media_impact_monitor/fulltext_coding.py b/backend-python/media_impact_monitor/fulltext_coding.py
index a9ab45c1..3c3a9ad1 100644
--- a/backend-python/media_impact_monitor/fulltext_coding.py
+++ b/backend-python/media_impact_monitor/fulltext_coding.py
@@ -4,7 +4,7 @@
 
 from media_impact_monitor.util.llm import completion
 
-system_prompt = "You're a sentiment analysis tool. For a given user input, always return the sentiment of the input. Return -1 for negative, 0 for neutral, and 1 for positive. Before you make your decision, reason about the decision."
+system_prompt = """You're a sentiment analysis tool. For a given user input, always return the sentiment of the input. Return -1 for negative, 0 for neutral, and 1 for positive. Before you make your decision, reason about the decision. Stick exactly to the specified JSON schema including the "sentiment_reasoning" and "sentiment" fields."""
 
 tools = [
     {
diff --git a/backend-python/media_impact_monitor/fulltexts.py b/backend-python/media_impact_monitor/fulltexts.py
index 65bf822c..a03770a3 100644
--- a/backend-python/media_impact_monitor/fulltexts.py
+++ b/backend-python/media_impact_monitor/fulltexts.py
@@ -28,9 +28,17 @@ def get_fulltexts(q: FulltextSearch) -> pd.DataFrame | None:
 
     assert q.topic or q.organizers or q.query or q.event_id
     keywords = load_keywords()
+    num_filters = sum(
+        [bool(q.topic), bool(q.organizers), bool(q.query), bool(q.event_id)]
+    )
+    if num_filters > 1:
+        raise ValueError(
+            "Only one of 'topic', 'organizers', 'query', 'event_id' is allowed."
+        )
     if q.topic:
-        assert q.topic == "climate_change"
-        assert not q.query and not q.organizers and not q.event_id
+        assert (
+            q.topic == "climate_change"
+        ), "Only 'climate_change' is supported as topic."
         query = xs(
             keywords["climate_science"]
             + keywords["climate_policy"]
@@ -38,16 +46,13 @@ def get_fulltexts(q: FulltextSearch) -> pd.DataFrame | None:
             q.media_source,
         )
     if q.organizers:
-        assert not q.topic and not q.query and not q.event_id
         for org in q.organizers:
             assert org in climate_orgs, f"Unknown organization: {org}"
         orgs = add_quotes(add_aliases(q.organizers))
         query = xs_with_ys(orgs, keywords["activism"], q.media_source)
     if q.query:
-        assert not q.topic and not q.organizers and not q.event_id
         query = q.query
     if q.event_id:
-        assert not q.topic and not q.query and not q.organizers
         events = get_events_by_id([q.event_id])
         assert len(events) == 1
         event = events.iloc[0]
diff --git a/backend-python/media_impact_monitor/fulltexts_test.py b/backend-python/media_impact_monitor/fulltexts_test.py
index 76b82d47..054b767a 100644
--- a/backend-python/media_impact_monitor/fulltexts_test.py
+++ b/backend-python/media_impact_monitor/fulltexts_test.py
@@ -39,9 +39,32 @@ def test_get_fulltexts_for_event():
     assert (texts["date"] <= date(2024, 5, 18)).all()
 
 
-# def test_get_mediacloud_fulltexts():
-#     start_date = date(2024, 5, 20)
-#     query = '"letzte generation"'
-#     fulltexts = get_mediacloud_fulltexts(
-#         query=query, start_date=start_date, countries=["Germany"]
-#     )
+def test_get_fulltexts_with_too_many_params():
+    with pytest.raises(ValueError) as e:
+        get_fulltexts(
+            FulltextSearch(
+                media_source="news_online",
+                topic="climate_change",
+                start_date=date(2023, 1, 1),
+                end_date=date(2024, 1, 31),
+                event_id="adb689988aa3e61021da64570bda6d95",
+            )
+        )
+    assert (
+        str(e.value)
+        == "Only one of 'topic', 'organizers', 'query', 'event_id' is allowed."
+    )
+
+
+def test_get_fulltexts_for_climate_change():
+    texts = get_fulltexts(
+        FulltextSearch(
+            media_source="news_online",
+            topic="climate_change",
+            start_date=date(2023, 1, 1),
+            end_date=date(2023, 1, 2),
+        )
+    )
+    assert texts is not None
+    assert len(texts) > 0
+    assert texts["date"].between(date(2023, 1, 1), date(2023, 1, 2)).all()
diff --git a/backend-python/media_impact_monitor/types_.py b/backend-python/media_impact_monitor/types_.py
index a1f043cc..293b8c9f 100644
--- a/backend-python/media_impact_monitor/types_.py
+++ b/backend-python/media_impact_monitor/types_.py
@@ -156,6 +156,10 @@ class PolicySearch(BaseModel):
 
 
 class FulltextSearch(BaseModel):
+    """
+    You can set parameters for media_source and date_range, and filter by one of the following: topic, organizers, query, or event_id. For now you cannot combine the latter filters, since they all affect the query in different ways.
+    """
+
     media_source: MediaSource = Field(
         description="The data source for the media data (i.e., online news, print news, etc.)."
     )