From 8ca4a1f5137ec65f01478e42fa86ea78dc296e35 Mon Sep 17 00:00:00 2001 From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com> Date: Tue, 11 Aug 2020 13:39:44 -0400 Subject: [PATCH] Updated README and examples --- README.md | 2 +- ..._Embeddings_index_from_a_data_source.ipynb | 104 ++++++++--------- .../04_Extractive_QA_with_Elasticsearch.ipynb | 106 +++++++++--------- 3 files changed, 106 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index a83b33e35..0cafae9c1 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ The easiest way to install is via pip and PyPI pip install txtai -You can also install txtai directly from GitHub using pip. Using a Python Virtual Environment is recommended. +You can also install txtai directly from GitHub. Using a Python Virtual Environment is recommended. pip install git+https://github.com/neuml/txtai diff --git a/examples/03_Build_an_Embeddings_index_from_a_data_source.ipynb b/examples/03_Build_an_Embeddings_index_from_a_data_source.ipynb index fd052d2f5..74347795d 100644 --- a/examples/03_Build_an_Embeddings_index_from_a_data_source.ipynb +++ b/examples/03_Build_an_Embeddings_index_from_a_data_source.ipynb @@ -78,7 +78,7 @@ "base_uri": "https://localhost:8080/", "height": 228 }, - "outputId": "6bbacf83-d695-42ec-cce0-0f92b1534ca4" + "outputId": "5088c2f3-e47b-4026-a306-519b51858be8" }, "source": [ "!wget https://www.kaggleusercontent.com/kf/40510829/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FX7Ote_I-Y88MBPQHRIdUQ.tTr7P3B_eUL_yWN33Usz0Rk1KXtc4DjT_cdjkl5W4WbEcZ-0FJX2jSWTHYMVACtLYuMJJrf6eJN28OzWhDMnTysBu3wfDrd4ly5bu_wJKCnZajICQgQHs_b8hbRVMOzfdG6xEyl9CVYnZNU2cI3QuOshcWxoB0skdKD4d26O_Q4e_nrd8DqEixP47tI2Hu1F00w0vMykzgNwp7SwQ2Z9HoNCO8HtmcjEHq0A4lZ4303YkpjORtZQEO3S-j54fFlIAahT-9VvsFNofitK5VAlR0EyG9r3cOqh2LQDCL7kj5p3MxG8dvHmrTqggLVOwiuKHUIH8u59TemSMLsNRS29W-5fFlHfaItV4dEuiBxCIgQXHcKUDCDGEjeFcPgqpJnNHsnh0pebWDuRQR_fdQ-r8mWgN9qLnosrFBak9tM25G7gqxyUI90GMWAUyP4yj2EAEc8asX9rUsirC8QDHmrmOCUe0cmZvodRUi0ss7lTiLTwm55d9VPXjQn4jQ6tFs-dmjXEx0AwF2Mw1c1jhgzCXwgQj6ybUKemr_6wj1VFYj3VVvCXpk1nZObl-IB6-m7v5CIoXGLot_KFsVtyItRk-wX-B_L3W3aS9dOIfb7bX4s5_aNzXaDKvxrcafwlOQui.vS_FL4EArO8rkBo3xpDF2w/articles.sqlite" @@ -88,16 +88,16 @@ { "output_type": "stream", "text": [ - "--2020-08-11 15:42:03-- https://www.kaggleusercontent.com/kf/40510829/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FX7Ote_I-Y88MBPQHRIdUQ.tTr7P3B_eUL_yWN33Usz0Rk1KXtc4DjT_cdjkl5W4WbEcZ-0FJX2jSWTHYMVACtLYuMJJrf6eJN28OzWhDMnTysBu3wfDrd4ly5bu_wJKCnZajICQgQHs_b8hbRVMOzfdG6xEyl9CVYnZNU2cI3QuOshcWxoB0skdKD4d26O_Q4e_nrd8DqEixP47tI2Hu1F00w0vMykzgNwp7SwQ2Z9HoNCO8HtmcjEHq0A4lZ4303YkpjORtZQEO3S-j54fFlIAahT-9VvsFNofitK5VAlR0EyG9r3cOqh2LQDCL7kj5p3MxG8dvHmrTqggLVOwiuKHUIH8u59TemSMLsNRS29W-5fFlHfaItV4dEuiBxCIgQXHcKUDCDGEjeFcPgqpJnNHsnh0pebWDuRQR_fdQ-r8mWgN9qLnosrFBak9tM25G7gqxyUI90GMWAUyP4yj2EAEc8asX9rUsirC8QDHmrmOCUe0cmZvodRUi0ss7lTiLTwm55d9VPXjQn4jQ6tFs-dmjXEx0AwF2Mw1c1jhgzCXwgQj6ybUKemr_6wj1VFYj3VVvCXpk1nZObl-IB6-m7v5CIoXGLot_KFsVtyItRk-wX-B_L3W3aS9dOIfb7bX4s5_aNzXaDKvxrcafwlOQui.vS_FL4EArO8rkBo3xpDF2w/articles.sqlite\n", + "--2020-08-11 16:29:37-- https://www.kaggleusercontent.com/kf/40510829/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FX7Ote_I-Y88MBPQHRIdUQ.tTr7P3B_eUL_yWN33Usz0Rk1KXtc4DjT_cdjkl5W4WbEcZ-0FJX2jSWTHYMVACtLYuMJJrf6eJN28OzWhDMnTysBu3wfDrd4ly5bu_wJKCnZajICQgQHs_b8hbRVMOzfdG6xEyl9CVYnZNU2cI3QuOshcWxoB0skdKD4d26O_Q4e_nrd8DqEixP47tI2Hu1F00w0vMykzgNwp7SwQ2Z9HoNCO8HtmcjEHq0A4lZ4303YkpjORtZQEO3S-j54fFlIAahT-9VvsFNofitK5VAlR0EyG9r3cOqh2LQDCL7kj5p3MxG8dvHmrTqggLVOwiuKHUIH8u59TemSMLsNRS29W-5fFlHfaItV4dEuiBxCIgQXHcKUDCDGEjeFcPgqpJnNHsnh0pebWDuRQR_fdQ-r8mWgN9qLnosrFBak9tM25G7gqxyUI90GMWAUyP4yj2EAEc8asX9rUsirC8QDHmrmOCUe0cmZvodRUi0ss7lTiLTwm55d9VPXjQn4jQ6tFs-dmjXEx0AwF2Mw1c1jhgzCXwgQj6ybUKemr_6wj1VFYj3VVvCXpk1nZObl-IB6-m7v5CIoXGLot_KFsVtyItRk-wX-B_L3W3aS9dOIfb7bX4s5_aNzXaDKvxrcafwlOQui.vS_FL4EArO8rkBo3xpDF2w/articles.sqlite\n", "Resolving www.kaggleusercontent.com (www.kaggleusercontent.com)... 35.190.26.106\n", "Connecting to www.kaggleusercontent.com (www.kaggleusercontent.com)|35.190.26.106|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 8065024 (7.7M) [application/octet-stream]\n", "Saving to: ‘articles.sqlite’\n", "\n", - "\rarticles.sqlite 0%[ ] 0 --.-KB/s \rarticles.sqlite 100%[===================>] 7.69M --.-KB/s in 0.07s \n", + "\rarticles.sqlite 0%[ ] 0 --.-KB/s \rarticles.sqlite 100%[===================>] 7.69M --.-KB/s in 0.08s \n", "\n", - "2020-08-11 15:42:03 (109 MB/s) - ‘articles.sqlite’ saved [8065024/8065024]\n", + "2020-08-11 16:29:38 (101 MB/s) - ‘articles.sqlite’ saved [8065024/8065024]\n", "\n" ], "name": "stdout" @@ -130,7 +130,7 @@ "base_uri": "https://localhost:8080/", "height": 156 }, - "outputId": "67406a51-0819-4845-84da-63d0ad6677f1" + "outputId": "6bb1973b-b8c3-483c-83ce-e234360f48df" }, "source": [ "import os\n", @@ -175,9 +175,9 @@ "Building 300 dimension model\n", "Converting vectors to magnitude format\n", "total 9024\n", - "-rw-r--r-- 1 root root 8065024 Aug 11 15:42 articles.sqlite\n", - "-rw-r--r-- 1 root root 360448 Aug 11 15:43 cord19-300d.magnitude\n", - "-rw-r--r-- 1 root root 807886 Aug 11 15:43 cord19-300d.txt\n", + "-rw-r--r-- 1 root root 8065024 Aug 11 16:29 articles.sqlite\n", + "-rw-r--r-- 1 root root 360448 Aug 11 16:30 cord19-300d.magnitude\n", + "-rw-r--r-- 1 root root 807886 Aug 11 16:30 cord19-300d.txt\n", "drwxr-xr-x 1 root root 4096 Jul 30 16:30 sample_data\n" ], "name": "stdout" @@ -205,7 +205,7 @@ "base_uri": "https://localhost:8080/", "height": 52 }, - "outputId": "f00ead87-aeaa-4243-8b68-663c01f30520" + "outputId": "c0ed68b5-0ae2-4d05-eb49-7d02abc978e5" }, "source": [ "import sqlite3\n", @@ -281,7 +281,7 @@ "source": [ "# Query data\n", "\n", - "The following runs a query against the embeddings index for the terms \"comorbidities risk factors\". It finds the top 5 matches and returns the corresponding documents associated with each match." + "The following runs a query against the embeddings index for the terms \"risk factors\". It finds the top 5 matches and returns the corresponding documents associated with each match." ] }, { @@ -293,7 +293,7 @@ "base_uri": "https://localhost:8080/", "height": 293 }, - "outputId": "d0e9afef-f761-450c-ec9b-2724b6aa0922" + "outputId": "77225b0a-0bed-42f9-a850-ebc58629fc95" }, "source": [ "import pandas as pd\n", @@ -306,7 +306,7 @@ "cur = db.cursor()\n", "\n", "results = []\n", - "for uid, score in embeddings.search(\"comorbidities risk factors\", 5):\n", + "for uid, score in embeddings.search(\"risk factors\", 5):\n", " cur.execute(\"SELECT article, text FROM sections WHERE id = ?\", [uid])\n", " uid, text = cur.fetchone()\n", "\n", @@ -349,22 +349,22 @@ " The identification of risk factors for contracting COVID-19 is crucial, to inform public health policy and to facilitate the appropriate distribution of healthcare resources.\n", " \n", " \n", - " Does apolipoprotein E genotype predict COVID-19 severity?\n", - " 2020-04-27 00:00:00\n", - " https://doi.org/10.1093/qjmed/hcaa142\n", - " Of interest, apoE4 has also been associated with some of the comorbid risk factors associated with severe COVID-19, such as atherosclerosis and hypertension .\n", + " Quantitative evaluation of olfactory dysfunction in hospitalized patients with Coronavirus [2] (COVID-19)\n", + " 2020-05-25 00:00:00\n", + " https://www.ncbi.nlm.nih.gov/pubmed/32451613/\n", + " In addition, these reports included patients with minor COVID-19 symptoms and low-risk factor burden.\n", " \n", " \n", - " COVID-19 and associations with frailty and multimorbidity: a prospective analysis of UK Biobank participants\n", - " 2020-07-23 00:00:00\n", - " https://www.ncbi.nlm.nih.gov/pubmed/32705587/\n", - " Number of comorbidity groupings were then summed, and categorised (0–1; 2; 3 or ≥ 4 comorbidity groupings).\n", + " COVID-19 from the perspective of urban and rural general adult mental health services\n", + " 2020-05-21 00:00:00\n", + " https://doi.org/10.1017/ipm.2020.62\n", + " At-risk groups among staff members and service users were identified early and prioritised in service changes.\n", " \n", " \n", - " COVID-19 and associations with frailty and multimorbidity: a prospective analysis of UK Biobank participants\n", - " 2020-07-23 00:00:00\n", - " https://www.ncbi.nlm.nih.gov/pubmed/32705587/\n", - " Number of comorbidity groupings were then summed, and categorised (0-1; 2; 3 or ≥ 4 comorbidity groupings).\n", + " Management of osteoarthritis during COVID‐19 pandemic\n", + " 2020-05-21 00:00:00\n", + " https://doi.org/10.1002/cpt.1910\n", + " Consistently, a recent report indicated diabetes as a risk factor significantly associated with COVID-19 unfavourable clinical outcomes (37) .\n", " \n", " \n", "" @@ -417,30 +417,30 @@ "base_uri": "https://localhost:8080/", "height": 293 }, - "outputId": "15c337b5-7c6d-4b5e-98aa-e3996a799737" + "outputId": "d01afea0-63a1-4e8e-806b-c9f6702b43b9" }, "source": [ "db = sqlite3.connect(\"articles.sqlite\")\n", "cur = db.cursor()\n", "\n", "results = []\n", - "for uid, score in embeddings.search(\"comorbidities risk factors\", 5):\n", + "for uid, score in embeddings.search(\"risk factors\", 5):\n", " cur.execute(\"SELECT article, text FROM sections WHERE id = ?\", [uid])\n", " uid, text = cur.fetchone()\n", "\n", " # Get list of document text sections to use for the context\n", " cur.execute(\"SELECT Id, Name, Text FROM sections WHERE (labels is null or labels NOT IN ('FRAGMENT', 'QUESTION')) AND article = ?\", [uid])\n", " sections = []\n", - " for sid, name, text in cur.fetchall():\n", + " for sid, name, txt in cur.fetchall():\n", " if not name or not re.search(r\"background|(?Prevalence and Impact of Myocardial Injury in Patients Hospitalized with COVID-19 Infection\n", " 2020-04-24 00:00:00\n", " http://medrxiv.org/cgi/content/short/2020.04.20.20072702v1?rss=1\n", - " Stratified by Troponin Levels, N = 2736 All Patients All rights reserved.\n", - " no CVD, and neither CVD nor risk factors\n", - " Mount Sinai Health System\n", + " This risk was consistent across patients stratified by history of CVD, risk factors but no CVD, and neither CVD nor risk factors.\n", + " neither CVD nor risk factors\n", + " New York City hospitals\n", " \n", " \n", " COVID-19 and associations with frailty and multimorbidity: a prospective analysis of UK Biobank participants\n", " 2020-07-23 00:00:00\n", " https://www.ncbi.nlm.nih.gov/pubmed/32705587/\n", - " The age range at baseline within our sample was 40–69 years, but it is important to note that this was during 2006–2010, and the age range at COVID-19 diagnosis was substantially older (50–84 years).\n", + " The identification of risk factors for contracting COVID-19 is crucial, to inform public health policy and to facilitate the appropriate distribution of healthcare resources.\n", " Frailty and multimorbidity\n", - " None\n", + " hospital settings\n", " \n", " \n", - " Does apolipoprotein E genotype predict COVID-19 severity?\n", - " 2020-04-27 00:00:00\n", - " https://doi.org/10.1093/qjmed/hcaa142\n", - " If so, this group could be targeted more aggressively from the outset of the disease.\n", - " None\n", - " None\n", + " Quantitative evaluation of olfactory dysfunction in hospitalized patients with Coronavirus [2] (COVID-19)\n", + " 2020-05-25 00:00:00\n", + " https://www.ncbi.nlm.nih.gov/pubmed/32451613/\n", + " In addition, these reports included patients with minor COVID-19 symptoms and low-risk factor burden.\n", + " patients with minor COVID-19 symptoms and low-risk factor burden\n", + " COVID-19 wards\n", " \n", " \n", - " COVID-19 and associations with frailty and multimorbidity: a prospective analysis of UK Biobank participants\n", - " 2020-07-23 00:00:00\n", - " https://www.ncbi.nlm.nih.gov/pubmed/32705587/\n", - " The age range at baseline within our sample was 40–69 years, but it is important to note that this was during 2006–2010, and the age range at COVID-19 diagnosis was substantially older (50–84 years).\n", - " Frailty and multimorbidity\n", - " None\n", + " COVID-19 from the perspective of urban and rural general adult mental health services\n", + " 2020-05-21 00:00:00\n", + " https://doi.org/10.1017/ipm.2020.62\n", + " At-risk groups among staff members and service users were identified early and prioritised in service changes.\n", + " At-risk groups among staff members and service users\n", + " rural regions\n", " \n", " \n", - " COVID-19 and associations with frailty and multimorbidity: a prospective analysis of UK Biobank participants\n", - " 2020-07-23 00:00:00\n", - " https://www.ncbi.nlm.nih.gov/pubmed/32705587/\n", - " The age range at baseline within our sample was 40–69 years, but it is important to note that this was during 2006–2010, and the age range at COVID-19 diagnosis was substantially older (50–84 years).\n", - " Frailty and multimorbidity\n", + " Management of osteoarthritis during COVID‐19 pandemic\n", + " 2020-05-21 00:00:00\n", + " https://doi.org/10.1002/cpt.1910\n", + " Consistently, a recent report indicated diabetes as a risk factor significantly associated with COVID-19 unfavourable clinical outcomes (37) .\n", + " sex, obesity, genetic factors and mechanical factors\n", " None\n", " \n", " \n", diff --git a/examples/04_Extractive_QA_with_Elasticsearch.ipynb b/examples/04_Extractive_QA_with_Elasticsearch.ipynb index f8db614f8..ca3671c63 100644 --- a/examples/04_Extractive_QA_with_Elasticsearch.ipynb +++ b/examples/04_Extractive_QA_with_Elasticsearch.ipynb @@ -54,7 +54,7 @@ "!tar -xzf elasticsearch-7.8.1-linux-x86_64.tar.gz\n", "!chown -R daemon:daemon elasticsearch-7.8.1" ], - "execution_count": 16, + "execution_count": 1, "outputs": [] }, { @@ -72,7 +72,7 @@ "server = Popen(['elasticsearch-7.8.1/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))\n", "!sleep 30" ], - "execution_count": 17, + "execution_count": 2, "outputs": [] }, { @@ -98,26 +98,26 @@ "base_uri": "https://localhost:8080/", "height": 228 }, - "outputId": "7f46594f-b0f2-4994-dc0b-70e50fe132fc" + "outputId": "d7720f9c-8b2f-43d7-f0d4-9972d318cecb" }, "source": [ "!wget https://www.kaggleusercontent.com/kf/40510829/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FX7Ote_I-Y88MBPQHRIdUQ.tTr7P3B_eUL_yWN33Usz0Rk1KXtc4DjT_cdjkl5W4WbEcZ-0FJX2jSWTHYMVACtLYuMJJrf6eJN28OzWhDMnTysBu3wfDrd4ly5bu_wJKCnZajICQgQHs_b8hbRVMOzfdG6xEyl9CVYnZNU2cI3QuOshcWxoB0skdKD4d26O_Q4e_nrd8DqEixP47tI2Hu1F00w0vMykzgNwp7SwQ2Z9HoNCO8HtmcjEHq0A4lZ4303YkpjORtZQEO3S-j54fFlIAahT-9VvsFNofitK5VAlR0EyG9r3cOqh2LQDCL7kj5p3MxG8dvHmrTqggLVOwiuKHUIH8u59TemSMLsNRS29W-5fFlHfaItV4dEuiBxCIgQXHcKUDCDGEjeFcPgqpJnNHsnh0pebWDuRQR_fdQ-r8mWgN9qLnosrFBak9tM25G7gqxyUI90GMWAUyP4yj2EAEc8asX9rUsirC8QDHmrmOCUe0cmZvodRUi0ss7lTiLTwm55d9VPXjQn4jQ6tFs-dmjXEx0AwF2Mw1c1jhgzCXwgQj6ybUKemr_6wj1VFYj3VVvCXpk1nZObl-IB6-m7v5CIoXGLot_KFsVtyItRk-wX-B_L3W3aS9dOIfb7bX4s5_aNzXaDKvxrcafwlOQui.vS_FL4EArO8rkBo3xpDF2w/articles.sqlite" ], - "execution_count": 18, + "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ - "--2020-08-11 15:41:39-- https://www.kaggleusercontent.com/kf/40510829/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FX7Ote_I-Y88MBPQHRIdUQ.tTr7P3B_eUL_yWN33Usz0Rk1KXtc4DjT_cdjkl5W4WbEcZ-0FJX2jSWTHYMVACtLYuMJJrf6eJN28OzWhDMnTysBu3wfDrd4ly5bu_wJKCnZajICQgQHs_b8hbRVMOzfdG6xEyl9CVYnZNU2cI3QuOshcWxoB0skdKD4d26O_Q4e_nrd8DqEixP47tI2Hu1F00w0vMykzgNwp7SwQ2Z9HoNCO8HtmcjEHq0A4lZ4303YkpjORtZQEO3S-j54fFlIAahT-9VvsFNofitK5VAlR0EyG9r3cOqh2LQDCL7kj5p3MxG8dvHmrTqggLVOwiuKHUIH8u59TemSMLsNRS29W-5fFlHfaItV4dEuiBxCIgQXHcKUDCDGEjeFcPgqpJnNHsnh0pebWDuRQR_fdQ-r8mWgN9qLnosrFBak9tM25G7gqxyUI90GMWAUyP4yj2EAEc8asX9rUsirC8QDHmrmOCUe0cmZvodRUi0ss7lTiLTwm55d9VPXjQn4jQ6tFs-dmjXEx0AwF2Mw1c1jhgzCXwgQj6ybUKemr_6wj1VFYj3VVvCXpk1nZObl-IB6-m7v5CIoXGLot_KFsVtyItRk-wX-B_L3W3aS9dOIfb7bX4s5_aNzXaDKvxrcafwlOQui.vS_FL4EArO8rkBo3xpDF2w/articles.sqlite\n", + "--2020-08-11 16:52:19-- https://www.kaggleusercontent.com/kf/40510829/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FX7Ote_I-Y88MBPQHRIdUQ.tTr7P3B_eUL_yWN33Usz0Rk1KXtc4DjT_cdjkl5W4WbEcZ-0FJX2jSWTHYMVACtLYuMJJrf6eJN28OzWhDMnTysBu3wfDrd4ly5bu_wJKCnZajICQgQHs_b8hbRVMOzfdG6xEyl9CVYnZNU2cI3QuOshcWxoB0skdKD4d26O_Q4e_nrd8DqEixP47tI2Hu1F00w0vMykzgNwp7SwQ2Z9HoNCO8HtmcjEHq0A4lZ4303YkpjORtZQEO3S-j54fFlIAahT-9VvsFNofitK5VAlR0EyG9r3cOqh2LQDCL7kj5p3MxG8dvHmrTqggLVOwiuKHUIH8u59TemSMLsNRS29W-5fFlHfaItV4dEuiBxCIgQXHcKUDCDGEjeFcPgqpJnNHsnh0pebWDuRQR_fdQ-r8mWgN9qLnosrFBak9tM25G7gqxyUI90GMWAUyP4yj2EAEc8asX9rUsirC8QDHmrmOCUe0cmZvodRUi0ss7lTiLTwm55d9VPXjQn4jQ6tFs-dmjXEx0AwF2Mw1c1jhgzCXwgQj6ybUKemr_6wj1VFYj3VVvCXpk1nZObl-IB6-m7v5CIoXGLot_KFsVtyItRk-wX-B_L3W3aS9dOIfb7bX4s5_aNzXaDKvxrcafwlOQui.vS_FL4EArO8rkBo3xpDF2w/articles.sqlite\n", "Resolving www.kaggleusercontent.com (www.kaggleusercontent.com)... 35.190.26.106\n", "Connecting to www.kaggleusercontent.com (www.kaggleusercontent.com)|35.190.26.106|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 8065024 (7.7M) [application/octet-stream]\n", - "Saving to: ‘articles.sqlite.1’\n", + "Saving to: ‘articles.sqlite’\n", "\n", - "\rarticles.sqlite.1 0%[ ] 0 --.-KB/s \rarticles.sqlite.1 100%[===================>] 7.69M --.-KB/s in 0.1s \n", + "\rarticles.sqlite 0%[ ] 0 --.-KB/s \rarticles.sqlite 100%[===================>] 7.69M 47.2MB/s in 0.2s \n", "\n", - "2020-08-11 15:41:39 (77.5 MB/s) - ‘articles.sqlite.1’ saved [8065024/8065024]\n", + "2020-08-11 16:52:20 (47.2 MB/s) - ‘articles.sqlite’ saved [8065024/8065024]\n", "\n" ], "name": "stdout" @@ -145,7 +145,7 @@ "base_uri": "https://localhost:8080/", "height": 34 }, - "outputId": "8ccfed53-bc46-4619-9b82-31a30ce38b55" + "outputId": "999563c1-a40b-4be2-f5c6-0515423aec13" }, "source": [ "import sqlite3\n", @@ -196,7 +196,7 @@ "\n", "print(\"Total articles inserted: {}\".format(rows))\n" ], - "execution_count": 19, + "execution_count": 4, "outputs": [ { "output_type": "stream", @@ -216,7 +216,7 @@ "source": [ "# Query data\n", "\n", - "The following runs a query against Elasticsearch for the terms \"comorbidities risk factors\". It finds the top 5 matches and returns the corresponding documents associated with each match.\n", + "The following runs a query against Elasticsearch for the terms \"risk factors\". It finds the top 5 matches and returns the corresponding documents associated with each match.\n", "\n" ] }, @@ -229,7 +229,7 @@ "base_uri": "https://localhost:8080/", "height": 293 }, - "outputId": "5c3b9acf-25b9-4782-faff-a098da84916b" + "outputId": "27d45ae0-c90e-4fff-b2b1-9cb87301da81" }, "source": [ "import pandas as pd\n", @@ -242,7 +242,7 @@ " \"_source\": [\"article\", \"title\", \"published\", \"reference\", \"text\"],\n", " \"size\": 5,\n", " \"query\": {\n", - " \"query_string\": {\"query\": \"comorbidities risk factors\"}\n", + " \"query_string\": {\"query\": \"risk factors\"}\n", " }\n", "}\n", "\n", @@ -255,7 +255,7 @@ "\n", "display(HTML(df.to_html(index=False)))" ], - "execution_count": 20, + "execution_count": 5, "outputs": [ { "output_type": "display_data", @@ -272,34 +272,34 @@ " \n", " \n", " \n", - " Management of osteoarthritis during COVID‐19 pandemic\n", - " 2020-05-21 00:00:00\n", - " https://doi.org/10.1002/cpt.1910\n", - " Indeed, risk factors are sex, obesity, genetic factors and mechanical factors (3) .\n", + " Does apolipoprotein E genotype predict COVID-19 severity?\n", + " 2020-04-27 00:00:00\n", + " https://doi.org/10.1093/qjmed/hcaa142\n", + " Risk factors associated with subsequent death include older age, hypertension, diabetes, ischemic heart disease, obesity and chronic lung disease; however, sometimes there are no obvious risk factors .\n", " \n", " \n", - " Prevalence and Impact of Myocardial Injury in Patients Hospitalized with COVID-19 Infection\n", - " 2020-04-24 00:00:00\n", - " http://medrxiv.org/cgi/content/short/2020.04.20.20072702v1?rss=1\n", - " This risk was consistent across patients stratified by history of CVD, risk factors but no CVD, and neither CVD nor risk factors.\n", + " Early Safety Indicators of COVID-19 ConvalescentPlasma in 5,000 Patients\n", + " 2020-05-14 00:00:00\n", + " https://www.ncbi.nlm.nih.gov/pubmed/32511566/\n", + " TRALI often presents as bilateral pulmonary edema with little evidence of circulatory overload, and TRALI is further categorized into two types based on the absence of acute respiratory distress syndrome (ARDS) risk factors (type I) or presence of ARDS risk factors (type II) (18) .\n", " \n", " \n", - " Lower prevalence of antibodies neutralizing SARS-CoV-2 in group O French blood donors\n", - " 2020-07-15 00:00:00\n", - " https://www.ncbi.nlm.nih.gov/pubmed/32679056/\n", - " Age, sex and various comorbidities have been identified as factors worsening the prognosis of the disease.\n", + " Associations with covid-19 hospitalisation amongst 406,793 adults: the UK Biobank prospective cohort study\n", + " 2020-05-11 00:00:00\n", + " http://medrxiv.org/cgi/content/short/2020.05.06.20092957v1?rss=1\n", + " In addition, many risk factors for covid-19 documented in the literature are highly correlated and it is not clear which may be independently related to risk.\n", " \n", " \n", - " Does apolipoprotein E genotype predict COVID-19 severity?\n", - " 2020-04-27 00:00:00\n", - " https://doi.org/10.1093/qjmed/hcaa142\n", - " Risk factors associated with subsequent death include older age, hypertension, diabetes, ischemic heart disease, obesity and chronic lung disease; however, sometimes there are no obvious risk factors .\n", + " Associations with covid-19 hospitalisation amongst 406,793 adults: the UK Biobank prospective cohort study\n", + " 2020-05-11 00:00:00\n", + " http://medrxiv.org/cgi/content/short/2020.05.06.20092957v1?rss=1\n", + " The large numbers of covariables available in this cohort also enabled multivariable adjustment, permitting assessment of independent risk factors.\n", " \n", " \n", - " Risk Stratification for Healthcare workers during the CoViD-19 Pandemic; using demographics, co-morbid disease and clinical domain in order to assign clinical duties\n", - " 2020-05-09 00:00:00\n", - " http://medrxiv.org/cgi/content/short/2020.05.05.20091967v1?rss=1\n", - " A risk stratification tool was compiled using a Caucasian female <50years with no comorbidities as a reference.\n", + " Autoinflammatory and autoimmune conditions at the crossroad of COVID-19\n", + " 2020-06-16 00:00:00\n", + " https://doi.org/10.1016/j.jaut.2020.102506\n", + " In addition, personalized approaches including genotypification of risk genes and evaluation of risk factors for autoimmunity (i.e., familial autoimmunity) must be also considered in the current development of vaccines .\n", " \n", " \n", "" @@ -332,7 +332,7 @@ "# Create extractor instance using qa model designed for the CORD-19 dataset\n", "extractor = Extractor(embeddings, \"NeuML/bert-small-cord19qa\")" ], - "execution_count": 21, + "execution_count": 6, "outputs": [] }, { @@ -344,7 +344,7 @@ "base_uri": "https://localhost:8080/", "height": 328 }, - "outputId": "4e341551-5af7-492f-e5c9-1904f8bf3689" + "outputId": "521198fc-a54b-4038-8a1e-46b399eb63f8" }, "source": [ "document = {\n", @@ -377,7 +377,7 @@ "\n", " # Use QA extractor to derive additional columns\n", " answers = extractor(sections(source[\"article\"]), [(\"Risk factors\", \"risk factor\", \"What are names of risk factors?\", False),\n", - " (\"Locations\", \"city country state\", \"What are names of the locations?\", False)])\n", + " (\"Locations\", \"city country state\", \"What are names of locations?\", False)])\n", "\n", " results.append((source[\"title\"], source[\"published\"], source[\"reference\"], source[\"text\"]) + tuple([answer[1] for answer in answers]))\n", "\n", @@ -385,7 +385,7 @@ "\n", "display(HTML(df.to_html(index=False)))" ], - "execution_count": 22, + "execution_count": 7, "outputs": [ { "output_type": "display_data", @@ -409,7 +409,7 @@ " https://doi.org/10.1002/cpt.1910\n", " Indeed, risk factors are sex, obesity, genetic factors and mechanical factors (3) .\n", " Comorbidities\n", - " None\n", + " extrapulmonary sites\n", " \n", " \n", " Prevalence and Impact of Myocardial Injury in Patients Hospitalized with COVID-19 Infection\n", @@ -420,14 +420,6 @@ " None\n", " \n", " \n", - " Lower prevalence of antibodies neutralizing SARS-CoV-2 in group O French blood donors\n", - " 2020-07-15 00:00:00\n", - " https://www.ncbi.nlm.nih.gov/pubmed/32679056/\n", - " Age, sex and various comorbidities have been identified as factors worsening the prognosis of the disease.\n", - " Age, sex and various comorbidities\n", - " Paris\n", - " \n", - " \n", " Does apolipoprotein E genotype predict COVID-19 severity?\n", " 2020-04-27 00:00:00\n", " https://doi.org/10.1093/qjmed/hcaa142\n", @@ -436,12 +428,20 @@ " None\n", " \n", " \n", - " Risk Stratification for Healthcare workers during the CoViD-19 Pandemic; using demographics, co-morbid disease and clinical domain in order to assign clinical duties\n", - " 2020-05-09 00:00:00\n", - " http://medrxiv.org/cgi/content/short/2020.05.05.20091967v1?rss=1\n", - " A risk stratification tool was compiled using a Caucasian female <50years with no comorbidities as a reference.\n", - " Those working with aerosol generating procedures were at increased risk\n", - " None\n", + " COVID-19 and associations with frailty and multimorbidity: a prospective analysis of UK Biobank participants\n", + " 2020-07-23 00:00:00\n", + " https://www.ncbi.nlm.nih.gov/pubmed/32705587/\n", + " BACKGROUND: Frailty and multimorbidity have been suggested as risk factors for severe COVID-19 disease.\n", + " Frailty and multimorbidity\n", + " comorbidity groupings\n", + " \n", + " \n", + " COVID-19: what has been learned and to be learned about the novel coronavirus disease\n", + " 2020-03-15 00:00:00\n", + " https://doi.org/10.7150/ijbs.45134\n", + " • Three major risk factors for COVID-19 were sex (male), age (≥60), and severe pneumonia.\n", + " age and underlying disease are strongly correlated\n", + " cities, provinces, and countries\n", " \n", " \n", ""