Skip to content

Statistics

Victor Lin edited this page Feb 21, 2021 · 3 revisions

number of biosamples: 16,452,587

$ curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=biosample" 
...
	<DbName>biosample</DbName>
	<DbBuild>Build210216-1946m.1</DbBuild>
	<Count>16452587</Count>
	<LastUpdate>2021/02/16 23:12</LastUpdate>

potential coverage after geocoding: 31.15% (share of biosamples with extracted coordinates or extracted location text)

-- Fraction of biosample5 rows that carry extracted coordinates or extracted
-- location text. The result is a ratio (multiply by 100 for a percentage).
-- Computed in a single pass over the table instead of two separate counts;
-- nullif() yields NULL rather than a division-by-zero error on an empty table.
select
    (count(*) filter (
        where geo_coord_extracted is not null
           or geo_text_extracted is not null
    ))::decimal / nullif(count(*), 0) as geocodable_fraction
from biosample5

potential Serratus coverage after geocoding: 41.71% (2,410,955 of 5,780,798 Serratus SRA runs)

-- # serratus sra with potential geospatial data: 2410955
-- Counts srarun rows whose bio_sample matches a biosample5 row that has
-- extracted coordinates or extracted location text. Written as a semi-join
-- (exists) so srarun is read once, instead of intersecting and joining back.
-- NOTE(review): this counts srarun rows, while the "serratus sra" total counts
-- distinct run values; the two are comparable only if run is unique in srarun
-- -- confirm.
select count(*)
from srarun as s
where exists (
    select 1
    from biosample5 as b
    where b.biosample_id = s.bio_sample
      and (
          b.geo_coord_extracted is not null
          or b.geo_text_extracted is not null
      )
)

-- # serratus sra: 5780798
-- Number of distinct run values present in srarun.
select count(*)
from (
    select distinct run
    from srarun
) as distinct_runs

biosamples to be geocoded: 2,490,233

-- Biosamples that still need geocoding: location text was extracted
-- but no coordinates were.
select count(*)
from biosample5
where geo_text_extracted is not null
  and geo_coord_extracted is null

size of text domain to be geocoded (distinct extracted location strings): 49,578

-- Number of distinct non-NULL geo_text_extracted values.
-- NOTE(review): this reads biosample2, whereas the other queries on this page
-- read biosample5 -- confirm which table is intended.
select count(*)
from (
    select distinct geo_text_extracted
    from biosample2
    where geo_text_extracted is not null
) as distinct_texts