Skip to content

Commit

Permalink
Merge pull request #5 from ntluong95/causal-chain-refactor
Browse files Browse the repository at this point in the history
Causal chain refactor
  • Loading branch information
ntluong95 authored Nov 15, 2024
2 parents 96409c7 + e3d33cd commit ec44232
Show file tree
Hide file tree
Showing 18 changed files with 6,194 additions and 235 deletions.
1 change: 1 addition & 0 deletions .Renviron
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
RETICULATE_PYTHON=.venv
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ notebooks/.env
/_build/
CRE_workshop_materials.zip
/index_files/
.venv
.venv.Renviron
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.12.4
185 changes: 185 additions & 0 deletions app.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
pacman::p_load(
rio,
here,
igraph,
tidygraph,
ggraph,
janitor,
bslib,
shiny,
tidyverse,
viridis
)

result_df <- import(here("notebooks", "cosine_test_31 Oct.csv"))

ui <- page_sidebar(
title = "Network Analysis Visualization",
sidebar = sidebar(
selectInput("centrality_method", "Centrality Measure:",
choices = c("Degree" = "degree",
"Betweenness" = "betweenness",
"Closeness" = "closeness"),
selected = "betweenness"),
selectInput("community_method", "Community Detection:",
choices = c("Cosine Similarity" = "cosine",
"Infomap" = "infomap",
"Label Propagation" = "label_prop",
"Leading Eigenvector" = "leading_eigen"),
selected = "infomap"),
selectInput("layout_type", "Graph Layout:",
choices = c("Fruchterman-Reingold" = "fr",
"Kamada-Kawai" = "kk",
"Stress" = "stress",
"Circle" = "circle",
"Grid" = "grid")),
numericInput("small_cluster_threshold",
"Small Cluster Threshold:",
value = 5,
min = 1),
hr(),
helpText("Select different methods to analyze network centrality and community structure")
),

card(
card_header("Network Visualization"),
plotOutput("network_plot", height = "800px")
)
)

server <- function(input, output, session) {

get_cluster_colors <- function(n) {
viridis(n, option = "D")
}

network_data <- reactive({
# First create nodes dataframe with cluster information
nodes_df <- result_df %>%
select(Cause_category, Cause_cluster) %>%
rename(name = Cause_category, cluster = Cause_cluster) %>%
bind_rows(
result_df %>%
select(Effect_category, Effect_cluster) %>%
rename(name = Effect_category, cluster = Effect_cluster)
) %>%
# Keep original name and cluster before transformation
mutate(
original_name = name,
original_cluster = cluster,
name = case_when(
str_detect(name, "transmission|spread") ~ "disease transmission",
TRUE ~ name
)
) %>%
# Group by the new name and keep all unique clusters
group_by(name) %>%
mutate(
all_clusters = paste(sort(unique(original_cluster)), collapse = "_")
) %>%
ungroup() %>%
# Keep one row per unique name-cluster combination
distinct(name, all_clusters, .keep_all = TRUE) %>%
filter(!name %in% c("No content", "No context"))

# Create edges dataframe
edges_df <- result_df %>%
select(Cause_category, Effect_category) %>%
filter(!Cause_category %in% c("No content", "No context")) %>%
mutate(across(c(Cause_category, Effect_category),
~case_when(
str_detect(., "transmission|spread") ~ "disease transmission",
TRUE ~ .
)))

# Create graph with node attributes
network_recode1 <- tbl_graph(nodes = nodes_df,
edges = edges_df,
directed = TRUE)

# Calculate centrality and community
network_recode1 <- network_recode1 %>%
mutate(
centrality = case_when(
input$centrality_method == "degree" ~ centrality_degree(),
input$centrality_method == "betweenness" ~ centrality_betweenness(),
input$centrality_method == "closeness" ~ centrality_closeness()
),
community = case_when(
input$community_method == "cosine" ~ as.factor(all_clusters),
input$community_method == "infomap" ~ as.factor(group_infomap()),
input$community_method == "label_prop" ~ as.factor(group_label_prop()),
input$community_method == "leading_eigen" ~ as.factor(group_leading_eigen())
)
)

# Only apply small cluster threshold for non-cosine methods
if(input$community_method != "cosine") {
network_recode1 <- network_recode1 %>%
mutate(
community = if_else(
community %in% names(which(table(community) <= input$small_cluster_threshold)),
"Unclassified",
"Cluster"
),
community = as.factor(community)
)
}

network_recode1 %>%
activate(edges) %>%
mutate(edge_type = if_else(.N()$centrality[from] > 3,
"link to endpoint",
"link between drivers"))
})

cluster_colors <- reactive({
g <- network_data()

if(input$community_method == "cosine") {
unique_communities <- sort(unique(as.character(V(g)$community)))
n_communities <- length(unique_communities)
colors <- get_cluster_colors(n_communities)
names(colors) <- unique_communities
colors
} else {
c("Unclassified" = "#1b9e77", "Cluster" = "#7570b3")
}
})

output$network_plot <- renderPlot({
g <- network_data()

centrality_scaled <- scales::rescale(V(g)$centrality, to = c(0.3, 1))

ggraph(g, layout = input$layout_type) +
geom_edge_link(aes(color = edge_type),
arrow = arrow(length = unit(4, 'mm'),
type = "closed"),
end_cap = circle(2, 'mm'),
alpha = 0.7,
edge_width = 1) +
geom_node_point(aes(size = centrality,
color = community),
show.legend = TRUE) +
geom_node_text(aes(label = name,
size = centrality,
alpha = centrality),
repel = TRUE) +
scale_color_manual(values = cluster_colors(),
name = "Cluster") +
scale_edge_color_manual(values = c("link between drivers" = "gray",
"link to endpoint" = "#d95f02")) +
scale_size(range = c(3, 10)) +
scale_alpha(range = c(0.3, 1)) +
theme_graph() +
labs(title = paste("Network Analysis using",
tools::toTitleCase(input$centrality_method),
"centrality and",
tools::toTitleCase(input$community_method),
"community detection")) +
guides(alpha = "none")
})
}

shinyApp(ui, server)
275 changes: 263 additions & 12 deletions data/corpus.csv

Large diffs are not rendered by default.

Binary file added data/eppocodes.sqlite
Binary file not shown.
85 changes: 85 additions & 0 deletions data/prompt_human_animal.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
1. {Persona and task description}
You are an epidemiologist tasked with identifying sentences or phrases from outbreak reports that describe the drivers or contributors to the emergence or transmission of emerging pests and pathogens

2. {Domain localization, terms’ definition}

Drivers: underlying socio-economic, environmental, or ecological forces that create conditions favourable for how a disease emerges, spreads or sustains transmission in human, animals or plants.
Pressure: human anthropogenic activities that are mainly responsible for the chances of spillover events and the transmission of pests and pathogens
State: the current circulation of pests and pathogens, represented as either new case detected, an endemic, an epidemic or a pandemic
Impacts: the effects caused by pests and pathogens on individual, community's socio-economic, and political
Responses: the actions and interventions taken by governments to manage the occurrence of drivers and pressures, and to control the spread of pests and pathogens and to mitigate the impacts


Causality definition: In the reports, causality can take two forms. The first form is "Intra-sentence causality", the “cause” and the “effect” lie in a single sentence while in "Inter-sentence causality", the “cause” and the “effect” lie in different sentences.

3. {Few-shot examples describing how drivers can be reported}: Denoting: (C): Cause, (E): Effect


- Single cause, single effect (Type 1)

Example 1: (C1) High population density and mobility in urban areas (C1) have facilitated (E1) the rapid spread of the virus (E1)".

Example 2: There is (C1) no vaccine for Influenza A(H1N1)v infection currently licensed for use in humans (C1). Seasonal influenza vaccines against human influenza viruses are generally not expected to protect people from (E1) infection with influenza viruses (E1) that normally circulate in pigs, but they can reduce severity.


- Single cause, multiple effects (Type 2)

Example 3: Several countries including Cameroon, Ethiopia, Haiti, Lebanon, Nigeria (north-east of the country), Pakistan, Somalia, Syria and the Democratic Republic of Congo (eastern part of the country) are in the midst of complex (C1) humanitarian crises (C1) with (E1) fragile health systems (E1), (E1) inadequate access to clean water and sanitation (E1) and have (E1) insufficient capacity to respond to the outbreaks (E1)

- Multiple causes, single effect (Type 3)
Example 4: Moreover, (C1) a low index of suspicion (C1), (C1) socio-cultural norms (C1), (C1) community resistance (C1), (C1) limited community knowledge regarding anthrax transmission (C1), (C1) high levels of poverty (C1) and (C1) food insecurity (C1), (C1) a shortage of available vaccines and laboratory reagents (C1), (C1) inadequate carcass disposal (C1) and (C1) decontamination practices (C1) significantly contribute to hampering (E1) the containment of the anthrax outbreak (E1).

Example 5:
The (E1) risk at the national level (E1) is assessed as ‘High’ due to the following:
+ In other parts of Timor-Leste (C1) health workers have limited knowledge dog bite and scratch case management (C1) including PEP and RIG administration
+ (C2) Insufficient stock of human rabies vaccines (C2) in the government health facilities.

- Multiple causes, multiple effects (Type 4) - Chain of causalities
The text may describe a chain of causality, where one effect becomes then the cause of another effect. To describe the chain, you should number the causes and effects. For example, cause 1 (C1) -> effect 1 (E1), but since effect 1 is also cause of effect 2, you should do cause 1 (C1) -> effect 1 (E1, C2) -> effect 2 (E2).

Example 6: (E2) The risk of insufficient control capacities (E2) is considered high in Zambia due to (C1) concurrent public health emergencies in the country (cholera, measles, COVID-19) (C1) that limit the country’s human and (E1, C2) financial capacities to respond to the current anthrax outbreak adequately (E1, C2).

Example 7: (C1) Surveillance systems specifically targeting endemic transmission of chikungunya or Zika are weak or non-existent (C1) -> (E1, C2) Misdiagnosis between diseases & Skewed surveillance (E1, C2) -> (E2, C3) Misinform policy decisions (E2, C3) -> (E3)reduced accuracy on the estimation of the true burden of each diseases (E3), poor risk assessments (E3), and non optimal clinical management and resource allocation (E3).

Example 8: (C1) Changes in the predominant circulating serotype (C1) -> (E1, C2) increase the population risk of subsequent exposure to a heterologous DENV serotype (E1, C2), -> (E2) which increases the risk of higher rates of severe dengue and deaths (E2).

4. {Negative cases}
Some sentences contain causal relationships, but the effect may not be related to the disease transmission or emergence. Avoid classifying those causal relationships.

Example 1 (no causality): Because these viruses continue to be detected in swine populations worldwide, further human cases following direct or indirect contact with infected swine can be expected.

Example 2 (no relevant causality): There is some (E1) pressure on the healthcare capacity (E1) due to the (C1) very high number of admissions for dengue (C1); (C1) high vector density (C1); and an (C1) anticipated prolonged monsoon (C1).

Example 3 (no relevant causality): (C1) MVD is a highly virulent disease (C1) that can cause (E1) haemorrhagic fever (E1) and is clinically similar to Ebola virus disease.

5. {Mechanism of causality}
When the text describes/list possible mechanisms behind the cause of transmission or emergence, tag them with (M). A mechanism of causality describes the specific interactions between the pathogen, host, and environment that causes the transmission / emergence. They often describe interactions at the physiological level.

Example 1: The global outbreak 2022 — 2024 has shown that (C1) sexual contact (C1) enables faster and more efficient (E1) spread of the virus (E1) from one person to another due to (M1) direct contact of mucous membranes between people (M1), (M1) contact with multiple partners (M1), (M1) a possibly shorter incubation period on average (M1), and (M1) a longer infectious period for immunocompromised individuals (M1).

6. {Sign of causality}

For each cause-effect relationship, indicate whether each cause (C) is positive (C+) or negative (C-) and each effect (E) is positive (E+) or negative (E-).
Use the list of positive and negative sign words provided to help determine the sign of each cause and effect. Be mindful of sentences with negations (e.g., “does not improve”), which reverses polarity.
Positive sign words: increase, facilitate, support, improve, expand, promote, enable, enhance, accelerate, advance, grow, boost, strengthen, benefit, contribute, progress, initiate, develop, elevate, stimulate, alleviate, optimize, revitalize.
Negative sign words: limit, decrease, reduce, hamper, hinder, restrict, suppress, impair, inhibit, undermine, challenge, disrupt, lack, insufficient, incomplete, challenge, deficit, obstacle, barrier, diminish, shortage, scarcity, obstruct, worsen, decline.

Example 1: “(C1-) a lack of timely access to diagnostics in many areas (C1-), (C1-) incomplete epidemiological investigations (C1-), (C1-) challenges in contact tracing and extensive but inconclusive animal investigations (C1-) continue to hamper rapid response (E1-)”

Example 2: Moreover, (C1-) a low index of suspicion (C1-), (C1) socio-cultural norms (C1), (C1) community resistance (C1), (C1-) limited community knowledge regarding anthrax transmission (C1-), (C1+) high levels of poverty (C1+) and (C1) food insecurity (C1), (C1-) a shortage of available vaccines and laboratory reagents (C1-), (C1-) inadequate carcass disposal (C1-) and (C1) decontamination practices (C1) significantly contribute to hampering (E1-) the containment of the anthrax outbreak (E1-).
















Loading

0 comments on commit ec44232

Please sign in to comment.