diff --git a/README.md b/README.md index 43e5aab..7a2d88b 100644 --- a/README.md +++ b/README.md @@ -81,3 +81,23 @@ This is a python library that makes dealing with graphs super easy ### Pyvis [Pyvis python library](https://github.com/WestHealth/pyvis/tree/master) for visualisation. Pyvis generates Javascript Graph visualisations using python, so the final graphs can be hosted on the web. For example the [github link of this repo](https://rahulnyk.github.io/knowledge_graph/) is a graph generated by pyvis + +# Looking for contributions +This project needs a lot more work. There are some wonderful ideas suggested by folks on Medium and here on GitHub. If this interests you, please join hands and let's build this together. Here is a list of ideas: + +### Back End + +- [ ] Use embeddings to deduplicate semantically similar concepts (**Suggested by William Claude on the [Medium Article](https://medium.com/towards-data-science/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a)**) + - [ ] Avoid having similar concepts written differently by the LLM (e.g. "doctor" and "doctors") + - [ ] Reinforce the clustering of strongly similar concepts (e.g. "doctor" and "medical practitioner")? + +- [ ] Filter out redundant or outlier concepts that may not be useful in understanding the text. For example, generic concepts that occur too often in the text. (**Suggested by Luke Chesley**) + +- [ ] Better implement the concept of contextual proximity to avoid overweighting certain concepts that occur too frequently, or to weed out useless edges. (**Suggested by Luke Chesley**) + +### Front End +- [ ] Create a Frontend for rendering Graph of Concepts in a more useful way. For example, here is a flow. (**Suggested by David Garcia on the [Medium Article](https://medium.com/towards-data-science/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a)**). + 1. Provide a list of concepts/interests/topics + 2. User selects what they're interested in + 3. 
This expands to show sub-topics, sub-concepts, sub-x, etc. + 4. This is how you get deep into a specialty diff --git a/extract_graph.ipynb b/extract_graph.ipynb index 8ae3497..5607989 100644 --- a/extract_graph.ipynb +++ b/extract_graph.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -39,14 +39,14 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 15.88it/s]" + "100%|██████████| 1/1 [00:00<00:00, 17.20it/s]" ] }, { @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -135,31 +135,31 @@ "
700 rows × 5 columns
\n", "" ], "text/plain": [ - " node_1 \\\n", - "2 56 articles \n", - "10 [54] \n", - "30 [55] \n", - "50 a bad situation \n", - "70 a worrisome new trend \n", - "... ... \n", - "2827 world-class health facilities \n", - "2828 world-class health facilities \n", - "2829 world-class health facilities \n", - "2830 world-class health facilities \n", - "2831 world-class health facilities \n", - "\n", - " node_2 \\\n", - "2 extensive literature search \n", - "10 increasing violence against healthcare personnel \n", - "30 increasing violence against healthcare personnel \n", - "50 increasing violence against healthcare personnel \n", - "70 increasing violence against healthcare personnel \n", - "... ... \n", - "2827 nhm strategies \n", - "2828 rural areas \n", - "2829 social norms \n", - "2830 urban areas \n", - "2831 urban slums \n", + " node_1 node_2 \\\n", + "2827 world-class health facilities nhm strategies \n", + "2828 world-class health facilities rural areas \n", + "2829 world-class health facilities social norms \n", + "2830 world-class health facilities urban areas \n", + "2831 world-class health facilities urban slums \n", "\n", " chunk_id count \\\n", - "2 d7a3e5085c7f4de4bc28fb0bd9cb0a94,d7a3e5085c7f4... 2 \n", - "10 640835e2521045a395ab6465cc1ba4ca,640835e252104... 2 \n", - "30 640835e2521045a395ab6465cc1ba4ca,640835e252104... 2 \n", - "50 640835e2521045a395ab6465cc1ba4ca,640835e252104... 2 \n", - "70 640835e2521045a395ab6465cc1ba4ca,640835e252104... 2 \n", - "... ... ... \n", "2827 0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4... 10 \n", "2828 0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4... 2 \n", "2829 0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4... 2 \n", @@ -544,22 +470,14 @@ "2831 0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4... 2 \n", "\n", " edge \n", - "2 contextual proximity \n", - "10 contextual proximity \n", - "30 contextual proximity \n", - "50 contextual proximity \n", - "70 contextual proximity \n", - "... ... 
\n", "2827 contextual proximity \n", "2828 contextual proximity \n", "2829 contextual proximity \n", "2830 contextual proximity \n", - "2831 contextual proximity \n", - "\n", - "[700 rows x 5 columns]" + "2831 contextual proximity " ] }, - "execution_count": 205, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -592,7 +510,7 @@ "\n", "\n", "dfg2 = contextual_proximity(dfg1)\n", - "dfg2" + "dfg2.tail()" ] }, { @@ -604,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -785,7 +703,7 @@ "[758 rows x 5 columns]" ] }, - "execution_count": 168, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -809,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -818,7 +736,7 @@ "(215,)" ] }, - "execution_count": 169, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -830,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 182, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -862,14 +780,15 @@ }, { "cell_type": "code", - "execution_count": 183, + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Number of Communities = 17\n" + "Number of Communities = 17\n", + "[['56 articles', 'analysis', \"corresponding authors' experiential knowledge\", 'extensive literature search', 'peer-reviewed journals'], ['[54]', '[55]', 'a bad situation', 'a worrisome new trend', 'adequately compensated', 'can reverse the situation', 'defensive medicine practices', 'increasing violence against healthcare personnel', 'intense focus on specialization', 'low physician-to-patient ratio', 'overwhelmed physicians', 'primary care physicians', 'private marketplace', 'protect themselves by ordering unnecessary tests and procedures', 'results in delays in attending patients', 'set in', 'tempted to take on 
more patients than they can reasonably serve', 'thoughtful approach to government planning', 'underpaid physicians', 'unethical practices by pharmaceutical companies', 'will not be able to solve this'], ['accredit health facilities', 'enforcement of existing rules', 'health insurance scheme for central government employees', 'health system standardization', 'implementation of land allocation conditions', 'medical tourism hub', 'new rules for reasonable costs and cap profit margins', 'private health sector regulation', 'private health sector utilization'], ['accurate data about the quantity and geospatial location of its manpower', \"central government's leadership in the fight for better health for all indians\", 'live register for health personnel and infrastructure'], ['adoption', 'affordable, accessible, quality care', 'clinical and social skills', 'corporate houses', 'deeper structural problems', 'doctor-to-population ratio', 'erratic posting of personnel', 'evaluations', 'for-profit private health sector', 'government', 'government-funded health sector', 'health infrastructure', 'health personnel', 'health system', 'implementation', 'individualized mentoring', 'medical tourism', 'narayana health', 'national health mission (nhm)', 'non-hierarchical work environment', 'online training', 'out-of-pocket (oop) expenditure', 'out-of-pocket expenditure', 'overarching', 'patients', 'personnel retention', 'physicians', 'policy', 'primary care', 'private health sector', 'providing ongoing training', 'public and private health sectors', 'public health sector', 'public hospital beds', 'quality care', 'quality of care', 'quantum of services provided', 'recommendations made here', 'regulation', 'retaining', 'service provision', 'services they have been trained for', 'sourcing health personnel', 'strengthening', 'tier ii and iii cities', 'training', 'training initiative', 'transparency', 'transparency in creating training schedules', 'uniform standards', 'unmet demand for 
healthcare services', 'upgrading skills of nurses', 'vulnerable populations', 'well-equipped personnel', 'work environment', \"world health organization's recommended\"], ['ai-embedded logarithms now diagnose covid-19 disease from chest x-rays and ct scans', 'covid-19 pandemic catalyzing process and enabling delivery of healthcare', 'doctors can now perform an ophthalmic fundal examination online', 'drones being used to deliver medicines to communities', 'expanding application of technology to other portfolios within the health sector', 'innovations in digital technology supporting delivery of vital healthcare in high-income countries', 'manufacturing protective equipment locally using 3d printing', 'remote orthopedic examinations being used successfully', 'telemedicine for consultations with healthcare providers', 'uptake of telemedicine during covid-19 pandemic'], ['allocated budget to healthcare', 'ashas', 'ayush doctors', 'ayushman bharat', 'cadres', 'communities', 'communitization', 'contractors', 'contracts', 'contractual employees', 'contractual personnel', 'delayed', 'demotivation', 'doctors', 'economic norms', 'epidemiology/public health', 'financially protected from catastrophic health expenses', 'flexible financing', 'gdp', 'government of india', 'health policy', 'health sector reform', 'healthcare facilities', 'high-income countries', 'improved management through capacity building', 'india', 'indian government', 'indian-born physicians', 'informal providers', 'infrastructure', 'innovations in human resource management', 'just over $30 billion', 'largest émigré physician workforce in the world', 'lower remuneration', 'monitoring progress against standards', 'nhm', 'nhm strategies', 'nurses', 'oop expenditures by consumers', 'other middle-income countries and its neighbors', 'poor', 'poor infrastructure', 'public health', 'public sector', 'publicly financed health insurance scheme for the poor', 'publicly financed purchasing of services from private 
providers', 'rural areas', 'salary payments', 'skilled health workforce', 'social norms', 'temporary nature', 'total expenditure on health in india', 'urban areas', 'urban slums', 'viability concerns', 'working conditions', 'world-class health facilities'], ['anm', 'asha', 'female community health workers', 'government employee', 'private contractor'], ['centralization', 'interplay between private corporate sector, pharmaceutical industries, medical education, and healthcare services', 'medical commission', 'medical education', 'national medical council (nmc)', 'nmc act', 'state', 'universal health coverage'], [\"constraints imposed by corporate healthcare sector on doctors' professional autonomy\", 'doctors gravitating towards private sector employment due to low government salaries', 'high fees of private sector medical colleges', 'higher number of medical colleges per population in india compared to other countries', 'highest employer of doctors in india', 'highly paid doctors graduating from private sector medical colleges', 'low fees of government medical colleges', 'majority of young and early career doctors facing erosion of status and opportunities in the private healthcare sector', \"performance targets and practice constraints on doctors' professional autonomy in the corporate healthcare sector\", 'private sector healthcare employment', 'private sector investment in medical colleges', 'profitable medical colleges', 'star doctors with flourishing practices in the private healthcare sector'], ['digital technology', 'evin', 'medical devices', 'online training management information systems', 'wearable, trackable technology'], ['doctors and nurses/midwives', 'health workers density'], ['federal', 'health budget'], [\"india's health indicators\", 'methodology used', 'national medical council', 'peer nations', 'private sector', 'public-private sector divide', 'read approach', 'recent increase in the federal health budget', 'skewed inter-state', 'skilled 
personnel', 'urban-rural', 'who recommended thresholds'], ['initial evaluation', 'primary health centers', 'rural medical assistants (rmas)'], ['limited uptake', 'national health protection mission'], ['private health sector systems', 'public']]\n" ] } ], @@ -878,7 +797,8 @@ "top_level_communities = next(communities_generator)\n", "next_level_communities = next(communities_generator)\n", "communities = sorted(map(sorted, next_level_communities))\n", - "print(\"Number of Communities = \", len(communities))" + "print(\"Number of Communities = \", len(communities))\n", + "print(communities)" ] }, { @@ -890,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 192, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -923,31 +843,31 @@ "