From 6d15cc0f79956ab05aa38dda7e6ebfefa74fac18 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Fri, 19 Jan 2024 09:28:11 -0600 Subject: [PATCH 01/16] Update grab_api_manual_tables (for query_cats, though info is unchanged) --- inst/scripts/grab_api_manual_tables.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/inst/scripts/grab_api_manual_tables.R b/inst/scripts/grab_api_manual_tables.R index 77e751e..d59fd0d 100644 --- a/inst/scripts/grab_api_manual_tables.R +++ b/inst/scripts/grab_api_manual_tables.R @@ -1,10 +1,12 @@ # grab a couple of tables from arXiv API user manual -library(XML) library(httr) +library(rvest) + +url <- "https://arxiv.org/help/api/user-manual.html" +tabs <- html_elements(content(GET(url), encoding="UTF-8"), "table") +tabs <- lapply(tabs, function(z) as.data.frame(html_table(z))) -url <- "http://arxiv.org/help/api/user-manual" -tabs <- readHTMLTable(content(GET(url)), stringsAsFactors=FALSE) # make the first row the colnames header_as_colnames <- From d53133d760c821d18c8ad4937b61f6c24f856c77 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Fri, 19 Jan 2024 09:51:52 -0600 Subject: [PATCH 02/16] Working on script to grab arxiv_cats, but still broken --- inst/scripts/grab_api_manual_tables.R | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/inst/scripts/grab_api_manual_tables.R b/inst/scripts/grab_api_manual_tables.R index d59fd0d..08e132a 100644 --- a/inst/scripts/grab_api_manual_tables.R +++ b/inst/scripts/grab_api_manual_tables.R @@ -37,7 +37,34 @@ dimnames(query_terms) <- list(1:nrow(query_terms), c("term", "description")) #### # table of subject classifications # "Table: Subject Classifications" +# this has moved to https://arxiv.org/category_taxonomy +# it's no longer a table, but rather a bunch of divs :( +# +# categories are in

elements; short descriptions are in within those

elements +# full description in

within

that follows the
with the

+# larger classifications are in

#### +url <- "https://arxiv.org/category_taxonomy" +text <- content(GET(url), encoding="UTF-8") + +ids <- html_elements(text, "div h4")[-1] # first one is not a real category +ids <- sapply(strsplit(as.character(ids), "

"), "[", 2) +ids <- sapply(strsplit(as.character(ids), " "), "[", 1) + +short_descr <- html_elements(text, "div h4 span") +short_descr <- sub("(", "", as.character(short_descr), fixed=TRUE) +short_descr <- sub(")", "", as.character(short_descr), fixed=TRUE) + +larger_cat <- html_elements(text, "h2.accordion-head") +larger_cat <- sapply(strsplit(as.character(larger_cat), "[<>]"), "[", 3) + +larger_cat_abbr <- sapply(strsplit(ids, ".", fixed=TRUE), "[", 1) + +# ugh there are two levels of categories to extract, as well as short and long descriptions + + +# abbreviation, larger category, smaller category, short description, full description + arxiv_cats <- tabs[[11]] colnames(arxiv_cats) <- c("abbreviation", "description") arxiv_cats <- arxiv_cats[-1,] # drop header row From d5d58ca5c50d59dcf02b06bb3ad28bf0bb4e23df Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Sat, 20 Jan 2024 06:51:51 -0600 Subject: [PATCH 03/16] split script for building datasets into two (still not quite done) --- Makefile | 8 ++-- inst/scripts/README.md | 16 +++----- ..._api_manual_tables.R => grab_arxiv_cats.R} | 35 +----------------- inst/scripts/grab_query_terms.R | 37 +++++++++++++++++++ 4 files changed, 49 insertions(+), 47 deletions(-) rename inst/scripts/{grab_api_manual_tables.R => grab_arxiv_cats.R} (62%) create mode 100644 inst/scripts/grab_query_terms.R diff --git a/Makefile b/Makefile index c690d92..c785fbc 100644 --- a/Makefile +++ b/Makefile @@ -10,8 +10,10 @@ inst/doc/aRxiv.html: vignettes/aRxiv.Rmd cd $( Date: Sat, 20 Jan 2024 06:52:38 -0600 Subject: [PATCH 04/16] DESCRIPTION: update roxygen --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 69686cc..de1989a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,5 +28,5 @@ Suggests: VignetteBuilder: knitr Encoding: UTF-8 LazyData: true -RoxygenNote: 7.1.2 +RoxygenNote: 7.3.0 Roxygen: list(markdown = TRUE) From 27999301bd36df7d55556e01feea9efb1120992c Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Sat, 20 Jan 2024 06:54:37 -0600 Subject: [PATCH 05/16] Bump version; add to NEWS --- DESCRIPTION | 4 ++-- NEWS | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index de1989a..a892d48 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: aRxiv Title: Interface to the arXiv API -Version: 0.6 -Date: 2021-12-06 +Version: 0.7.1 +Date: 2024-01-20 Authors@R: c(person("Karthik", "Ram", role="aut", email="karthik.ram@gmail.com", comment=c(ORCID = "0000-0002-0233-1757")), person("Karl", "Broman", role=c("aut","cre"), diff --git a/NEWS b/NEWS index c12b528..4e938bd 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,12 @@ +aRxiv 0.7.1 +----------- + +MINOR CHANGES + +* Update arxiv_cats, datasets with category information, and the scripts to + build that and the dataset query_terms. + + aRxiv 0.6 --------- From 82f0154f2aa2ed16c01f27632e7fce025399329d Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Sat, 20 Jan 2024 09:37:03 -0600 Subject: [PATCH 06/16] Revise inst/scripts/README.md --- inst/scripts/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inst/scripts/README.md b/inst/scripts/README.md index bb5e712..242993e 100644 --- a/inst/scripts/README.md +++ b/inst/scripts/README.md @@ -1,11 +1,10 @@ ## R scripts to grab data tables - - [`grab_query_terms`](https://github.com/ropensci/aRxiv/tree/master/inst/scripts/grab_query_terms.R) -grabs search terms (`query_terms`) from the API user manual, +grabs search terms (`query_terms`) from the [API user manual](https://arxiv.org/help/api/user-manual.html) - [`grab_arxiv_cats.R`](http://github.com/ropensci/aRxiv/tree/master/inst/scripts/grab_arxiv_cats.R) -grabs the subject classifications (`arxiv_cats`) +grabs the subject classifications (`arxiv_cats`) from ```r library(aRxiv) From f58f8486917ffda806928b2c516391c7fc3393c5 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Sat, 20 Jan 2024 09:59:55 -0600 Subject: [PATCH 07/16] continued work on script to grab arxiv_cats --- inst/scripts/grab_arxiv_cats.R | 35 +++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/inst/scripts/grab_arxiv_cats.R b/inst/scripts/grab_arxiv_cats.R index 5ccadef..72e334f 100644 --- a/inst/scripts/grab_arxiv_cats.R +++ b/inst/scripts/grab_arxiv_cats.R @@ -18,21 +18,50 @@ ids <- html_elements(text, "div h4")[-1] # first one is not a real category ids <- sapply(strsplit(as.character(ids), "

"), "[", 2) ids <- sapply(strsplit(as.character(ids), " "), "[", 1) +# categories +cats <- sapply(strsplit(ids, ".", fixed=TRUE), "[", 1) + short_descr <- html_elements(text, "div h4 span") short_descr <- sub("(", "", as.character(short_descr), fixed=TRUE) short_descr <- sub(")", "", as.character(short_descr), fixed=TRUE) +# should be the same length +stopifnot(length(ids) == length(short_descr)) + larger_cat <- html_elements(text, "h2.accordion-head") larger_cat <- sapply(strsplit(as.character(larger_cat), "[<>]"), "[", 3) -larger_cat_abbr <- sapply(strsplit(ids, ".", fixed=TRUE), "[", 1) +cat_abbr <- sapply(strsplit(ids, ".", fixed=TRUE), "[", 1) +cat_abbr_uniq <- unique(larger_cat_abbr) # ugh there are two levels of categories to extract, as well as short and long descriptions +long_descr <- html_elements(text, "div.columns p") +long_descr <- sapply(strsplit(as.character(long_descr), "[<>]"), "[", 3) +long_descr <- long_descr[3:(length(long_descr)-1)] # drop first two plus last +stopifnot(length(long_descr) == length(short_descr)) + +# highest-level fields +fields <- html_elements(text, 'h2.accordion-head') +fields <- sapply(strsplit(as.character(fields), "[<>]"), "[", 3) + +# just physics is split up into sub-fields; look for these sub-fields +sub_fields_physics <- html_elements(text, 'div.physics div h3') +sub_fields_physics_abbr <- sapply(strsplit(as.character(sub_fields_physics), "[<>]"), "[", 7) +sub_fields_physics <- sapply(strsplit(as.character(sub_fields_physics), "[<>]"), "[", 3) +sub_fields_physics_abbr <- gsub("[\\(\\)]", "", sub_fields_physics_abbr) +stopifnot(length(sub_fields_physics) == length(sub_fields_physics_abbr)) + +# expand fields and sub-fields +physics_cats <- cats %in% sub_fields_physics_abbr +fields_abbr <- () -# abbreviation, larger category, smaller category, short description, full description +arxiv_cats <- data.frame(abbreviation=ids, + field=, + subfield=, + short_description=short_descr, + long_description=long_descr) -arxiv_cats <- tabs[[11]] colnames(arxiv_cats) <- c("abbreviation", "description") arxiv_cats <- arxiv_cats[-1,] # drop header row rownames(arxiv_cats) <- 1:nrow(arxiv_cats) From a7fd0ac84210295b0279d308a79a2d340a93dbe8 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Mon, 22 Jan 2024 08:11:46 -0600 Subject: [PATCH 08/16] Finish script to grab arxiv categories --- inst/scripts/grab_arxiv_cats.R | 45 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/inst/scripts/grab_arxiv_cats.R b/inst/scripts/grab_arxiv_cats.R index 72e334f..f685bfd 100644 --- a/inst/scripts/grab_arxiv_cats.R +++ b/inst/scripts/grab_arxiv_cats.R @@ -14,13 +14,12 @@ library(rvest) url <- "https://arxiv.org/category_taxonomy" text <- content(GET(url), encoding="UTF-8") +# category ID ids <- html_elements(text, "div h4")[-1] # first one is not a real category ids <- sapply(strsplit(as.character(ids), "

"), "[", 2) ids <- sapply(strsplit(as.character(ids), " "), "[", 1) -# categories -cats <- sapply(strsplit(ids, ".", fixed=TRUE), "[", 1) - +# short description short_descr <- html_elements(text, "div h4 span") short_descr <- sub("(", "", as.character(short_descr), fixed=TRUE) short_descr <- sub(")", "", as.character(short_descr), fixed=TRUE) @@ -28,11 +27,13 @@ short_descr <- sub(")", "", as.character(short_descr), fixed=TRUE) # should be the same length stopifnot(length(ids) == length(short_descr)) -larger_cat <- html_elements(text, "h2.accordion-head") -larger_cat <- sapply(strsplit(as.character(larger_cat), "[<>]"), "[", 3) +# highest-level fields +fields <- html_elements(text, 'h2.accordion-head') +fields <- sapply(strsplit(as.character(fields), "[<>]"), "[", 3) -cat_abbr <- sapply(strsplit(ids, ".", fixed=TRUE), "[", 1) -cat_abbr_uniq <- unique(larger_cat_abbr) +# field abbreviation +fields_abbr <- sapply(strsplit(ids, ".", fixed=TRUE), "[", 1) +fields_abbr_uniq <- unique(fields_abbr) # ugh there are two levels of categories to extract, as well as short and long descriptions long_descr <- html_elements(text, "div.columns p") @@ -40,9 +41,6 @@ long_descr <- sapply(strsplit(as.character(long_descr), "[<>]"), "[", 3) long_descr <- long_descr[3:(length(long_descr)-1)] # drop first two plus last stopifnot(length(long_descr) == length(short_descr)) -# highest-level fields -fields <- html_elements(text, 'h2.accordion-head') -fields <- sapply(strsplit(as.character(fields), "[<>]"), "[", 3) # just physics is split up into sub-fields; look for these sub-fields sub_fields_physics <- html_elements(text, 'div.physics div h3') @@ -51,21 +49,26 @@ sub_fields_physics <- sapply(strsplit(as.character(sub_fields_physics), "[<>]"), sub_fields_physics_abbr <- gsub("[\\(\\)]", "", sub_fields_physics_abbr) stopifnot(length(sub_fields_physics) == length(sub_fields_physics_abbr)) -# expand fields and sub-fields -physics_cats <- cats %in% sub_fields_physics_abbr -fields_abbr <- () - - -arxiv_cats <- data.frame(abbreviation=ids, - field=, - subfield=, +# create fields and subfields objects of same length as ids +fields_uniq <- fields_abbr_uniq +fields_uniq[fields_abbr_uniq %in% sub_fields_physics_abbr] <- "Physics" +fields_uniq[!(fields_abbr_uniq %in% sub_fields_physics_abbr)] <- fields[fields != "Physics"] +fields_count <- sapply(fields_abbr_uniq, function(a) sum(fields_abbr ==a)) +fields <- rep(fields_uniq, fields_count) + +sub_fields_physics_count <- sapply(sub_fields_physics_abbr, function(a) sum(fields_abbr == a)) +sub_fields_physics <- rep(sub_fields_physics, sub_fields_physics_count) +subfields <- fields +subfields[fields == "Physics"] <- sub_fields_physics +subfields[fields != "Physics"] <- NA + +arxiv_cats <- data.frame(category=ids, + field=fields, + subfield=subfields, short_description=short_descr, long_description=long_descr) -colnames(arxiv_cats) <- c("abbreviation", "description") -arxiv_cats <- arxiv_cats[-1,] # drop header row rownames(arxiv_cats) <- 1:nrow(arxiv_cats) ## save as data sets within package -save(query_terms, file="../../data/query_terms.RData") save(arxiv_cats, file="../../data/arxiv_cats.RData") From c6a0b3a3f496a485a8b8e6f980b5a11a6ad5a136 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Mon, 22 Jan 2024 08:13:13 -0600 Subject: [PATCH 09/16] Update data/arxiv_cats.RData --- data/arxiv_cats.RData | Bin 2181 -> 13695 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data/arxiv_cats.RData b/data/arxiv_cats.RData index 9b22f7975d94ceb04671cbd1db13d4fee442ddbc..307badf3e4d168008b2138ae696ab4b1b43bb474 100644 GIT binary patch literal 13695 zcmV-_HGs+=iwFP!000001MPiVk0jT5*pec3p~SmrOR^=E99x<(s+;0f66Yneb0>Fp zM>``@ju9Kx-KV=tTbE{4^~^ROZ07fIvNB{?cACjjaKoY#Fq{ikl+xbcY_PkiF> z8=rU*{=D(z_bv_x-@b9%9{K6<*$yju7%bwVl(*n9Q^xjY4PN^ z9qyjk!d+9?+p~oOTR5_XTec8&@UAWVk}2%pv;{lPzU^)Q)E3T7;o!m)4(&TfQNd2< z$i8!I2R*UPPHeMVcBQxOn8N9$M9u-gCjf&@IanTP{X5DyK(VOT<~9T)G$#dzG~cvdIz(|6+Ht+?@N zJjv5|H*Uw1yd8ISCw_V--siJ;&fkq2-|ZFSp3mc1oyW5}*T$dC+ODYw3v)1`>fmgx z>S%YZ>iA%->g05->Ri|IEWqYu0J9iQ@^(;FQvJNIE?qmAXGO98plgajJI>|7GyXx{ z&0Qn4FZkN`ms!;fW?3swp7u}LdfKfuZ@a7;lx{paxgBIy6ew|`nufE$tX8n`^MwBD>w8# zF0!`W)VwYi8>%N+mkqM2zZf^!&~{oZ2a{FRpWe(>8zwz-n-v3^ zb{f96UUcx&U-tsmy=n6n^I@Ak$!@4bI}PTLU@tD)`C<9=FfN~+_R4qT^4;F^^NYBA z;Yl*m3&Z-T7?>LsPY>eat*Gd?hKsDDs(6%U?Rq zn#onxxMa5(&+`tn`emaJzrbb;K%KgR*RfQ%`#=oj`OEQhV+cr97)jYA5SapgDt_(A z0a|w+!tJ2+@~y0zEwh=v{l&dzwdm?ulL0N}-racFgjs#V8(-edTkYhG$Xf*jZCuZ) z96Q$?(B>_7R>NFf@9gu?*|>3?OV;(-ui(hZdDkrIM0j$$#jJ*v&P#3e(m@Hd4=pbj zZagR27`p%Y!@4PPSlK}7W7*Xu_8rlc7nI`P0vKpA0|cKw!krEiOZv7g>h&lqUEJfB zj+bRt4SZ_^z9t*noj<9jI72$=y8r^YyUf(7_JVyFb^P3E29R30(h&W}5tvy)iLEo)MwSmCTpRdYMUBH$ajFe|jp4ZE2;v35LSPCG|zKZxH8)8pX zmak*Is76D=qq+h~O9A0?Y!OB9@KbwUp|x)<)jlGlvo3kUd;z;*yD zZo_oHbW*~OP?LA_iL2LJJb#|gkZP?xc)$`~gbgg0qNM>2J0JrJN>YSf?$N>L6tt3h znw;$)wCitN)C(&O{mdTvLL;!G$=nGqKMgY(0cGxLvzyQEWt{`UvWoHSd7<~oyFjzD zQQ-mQrTu(5bq%0VzD~ZM-(OW($ygJCg8gM zh6$YXTYvr-s5UN;w&T(@m$|$8P(t(4I|DA$YsYmY@;*t_MtAU(hwbgfgGJBlXvov> zWaN5spI~|2DEw&aFWgxG<;p(@{4$2U&jO&1vQb_H2I(cA9w^=gVD@~*j`7l6w*VFd z2fksK0PYb?;Aq#@|16-{xTDjH))MRo@q@2>y@F-|Lh~#uGU|DQ@|7H9O*d!mUSvQ! zkw8<0?XMOmuoYw_N z@kDh{{K_`A`Z|aLbu$613Jw(71Lh|^a{1L$*VZk}kV)0;tddd;W6uNp0gPou-x=dM z%0}()X@!p?Blx_8ME<%rUh*f3NDP!UtHLb43_=W?WPwbuX;7hr*V%tmSAMGJO8{^T zqyc${6i#{Yr$rV0`tBrK7^pgT#dL6|nZZteu+H0`+nZ;|s}rHN9+vk2!j=SMD(0~W zO{TXxo?P88ZlIix?Az1aAJ-fSD6gfatT>a(U4p~1VXbr$dn<5grHhM-H^(ZsjkT;wW z?7AyA)IT2>Z~Wk;Sks9gZj*$s#6L|8RV13WMz^0=usQ%FvC`Nxhs83V^kLyy#hiH9 zr+`SuwC8Jbo1;oua%m8`p89c;|l?RdUW_D zB+q~>!gpiE@=Hu)AK;$&BCi+UX=9Z^2}bm?Fd9#Dm!q*_yGWzKtaOpL?NT&8q9WIT zWb89oi$3r$)bl$;gsk`MtZup~P>~vVLx>oa9-eG%0*D0IN@!_udMmESc zkv-<`x%PLi$LsL&fr zfYDyI)8q=Y`*{K=;F8uYG9bqA$8z)oKx>}j5D0z(4`4zFZ7Jy3<6=1h@FFsArorl0 ziTz*V!E9v(?C26WV>btFq|AV-!19Z1kzC~Z`&{V79UNnUc82TY<~` z9yD*dEU!Q^fSz_whOm1uWU&hKEJYBa279N;xh#(eH#|QXzB>G7NAKDhp(p8F$mu^_HH`{%Hh&v zwCobZ2^_Y_?>X4du4XiRuMl^hk5Y%&29@k<%_ zrHBMHYU*qP?O-#J_QBC@tu+m+PJ3X3Xs~2{4z!c3hADvI{SMAp4Aubo-FUc-r;k(X zQxIZ&1~qT|-X;lP+;?iU;?$7cCn;wM&{=hYBWbc&j?g-L&#mP3cEwy6CQwfY&l^(Io|jmewGX5GKToI2$d}>5@yaI{LwMz#vs_KTZKk%l{YHj!ZrE@{gK7paO}u6 zCgS8P$jP??siw_mEC)`2h2beM(?a4g2E1oz2;{OO{&8y$joz|ki}@DP zn9b7r_CxH=F>;NRe%|DtZy}C(Nc%O}hOzY~qzREPTnDH<>6+-V2nLOZn9|YkRdzH6 zGW6>P-o0EaqCsYyCjr173d&ze8PpG1Iq37!$NzE=wSd)(=Z*vnOGE;isSvCbqFh#< zoP&Zx3K@-nci_fqZSspjtey}{Es&~`S%5}sVOxR0C3~lL&IjD8GY(yNL2u0lPpfS6w5C1yxs&S$@^k-vEza$kKC^SY*+ROdM+R@ z%p2hHqF#teo+Lttt@vD$tyi`|K+X$de50Zs-(v&2Q-627Jb<|zW!kI|2xB0a;N3ac z7C2kbTW2Wts-_y)YR<+?YDkEN>PIg8KOSc5)om{vrxwF&>x6;zQ~(l%DmKjTbl|Pm zwy)FIJfPaHt(C6}&;sII!qfp4Hd zL!^Op`iu=7h@LG^rqBM(4&php=A&#WFeBoMz;>L%<$skEontr=NRs4hNKr{|_~@X@ zdHMzDII5#*Kb&-UjM5=P4FmKs3Pb|FxJOV%66d9_O=g@BRrC=+_r~A!)M^4`kQfXT zBMKQyt-=Wepi_BLg@cC_^S;UQ-7Ea_D*uQT0{?tnr}smdULUKR_w>c8P^3kLFbYsc z=|n!7VFV;rAr=D7&rvL8oO-vc>8?Q*$;%hSI~Wsv z83@m6f@;-}P71{|M;T&uMV2SfNWIWCeJDfmD{L}*b;!SBNPE3+Pot<2`13n`!=*U^ zb@X|&845m2VWRIZ-O_nWX7KDhq<#C_Vn}^6_(v{vMi1mtX{ehzikWBQyMM))gtin` zYnp&}aM|`C9)uX#sGeE0sait`ztEcbcwVs6N=E7GD?+P{UYDJO(q&!K$93*e+);}| zTf283#@4tmKGXeR*!#G=0Lj^R9+nrJ1Ud5fK}6779a>b1Wvre%DboxR<5v8 zuDAI3*G+_WyQ<&-C<-j281*&nrUho<@fvzTo2PMIl*?AdEuK@$QR>#f@8`0anZA); zD`=Be@CNG$Z(xaBeat^7K;lir^|!H$cL@JwKc7_tEnBn5U5iVR+X52PkN&i_(=eWi z8!cxu0(uLpR_XjGzO3uWHH&(&A%bK-?~sh-)NK_+*?0}35fQ(|a_@$)fR8}3AOBxL zlk&_l^I?Fb7Z?Y88!k{-5ZlCp*$NtI1xWiro5#zC(d*fGOj=0K2JIn$MLe$HZkxJb zsT)DBb)63dK>em}JY=GGp2z0rEpu0@+z8ikcL_fMleerg;-2cNiS%{J+f(f6K$bt9Gt?TD5!&b_dt+K)?5TbynsY@xDWy0 zm!ijA>7C(7bqlHhdP{5IG;Jqw&TMq>b4DsSVMhi@pQ}-mPk82f?a*Bs4H57vK&s43 z=R*5flLA5@?V6gB2llr$Ncr+0jWAdn8TFKHLe|zO?=?OG@;ZAAI?T{oX%#D)3?AHS z4gI$D7tEniV{b6z#dku=k94= z5%g>w=GA!+A#SO2CM}xe-bgN)0EsB-1&_HQ?D|ttfr4{kGW2&o0R<=<*GnR# zBbSx9k=hKzX`noyfC2al_1@`kFK&dsPjOAqA)I%14^62YlR>YcP<4rfJ#Pc$q)i>q zYhafZxf8jqIo=&O1t42M6-G|G0;;M*!eAu+hof8v7nPL!ob??m6M6^3P9kauS;CAH z13LLZog>x8$Ae3{A{$N|S6$z83~yIW-eD}HSCI{8bh9!|a4^%3AQsMo;uI+84E;xoAiyr#6>PBCFL`GpUWq9PJ6PIS*xE)ssUWu_J6&}i5t4X{!UV<=76kK=Fo9O>ifg5<=$$#lcqNd>IK)`T-3VWS2|ypLNthkH|iNX4rUW3QJRFx&|P* zzcJ3}5CKdR-CWp7Bnb=&<31(KG{y9H zFl0iObeXqHPde2jloM~<>3Np%U3w>Ihj6@2CMFZR0y_I_#+GwjD21A94+{73$!tV- z+Erx5B7O6Mgb)h6Kk?T$!Mn9qMv6h-Ehn7OM_q<_gmrpJHBd>5Arrb}r>BA(1w;@G zSp}d})C!bC<2;SWxNklH3Mx82YeLXjnfx5j^yiFAA*M07&7h6|jN#AB`J2)yFCfT` zTNodAbznMYlGyqR*5aw-?v~4cwG&=hesLH`b&t>)up?y6i!M}c$^iUKWtQiG=V!h z5LWuKrXjO*EUdA|hsa@GO_ag}m}xi77)WKZ!0Sz-mk!3Yc!c=y2N(@A&lj8C^~ZJ% z3tQ9tY1KXDR82JMA)v{?4ufE-3PP~70B~QRpUd;&zxBv$^2luR6Kpp58#eh3!Wzej z;3e!7=_=9qIVV@)K{@$I|0a%zt;i2hq9b>%aOIu}Ir9l}WSCAbgPnr zd+Le-4?T;Kq&SdbEm>f~RSCQ((Q?iP6QAhQGo0-oe1ng;b8m8G!|1U7=vUUwZu0u$$_z8r*d?rSFk`GX%MPt8izS3{>TsgaE;Fd0|%5X zZ8UpPkIa1u8TPj{F$OtKEf~4?mnQqd6eNK*+;V8ujD{OJbytH?Jz1IPKTebFJW70>A4mm=R09nvS(Lw8Cb==^m66=kY(|$II@181w zLpnCnKaq|B4io1>ORB&GohuM%UE9njDGOJYq~qVbX|EV8qD(9iSu%xivk;f3_saNW z!wa$I)bEZjK2jX}S^~DBr4U0{B!d5+E&1Dm8)Fyo`+5ncejOK1Jn_)XL=TUb951~l zM^i{dUSKR5-6fREBgJOmL3v9_A`T;&g}mdSmsr56ex&UfZ4Z$N`$8K*^cqI+ z*$Vga+U`QLM7@B^!+Ofwj&)A(n^NB!4`!m`AGHx_D=%p_5L3wE}-C#I^D3nj%D;H|A@RU=^1mKP4wtY`_2 zl4XfZqXP)o<|ssyBL(#AMNO8f_fAgFTECg7W|1Y!Y|{v4dJnYl!nM791^B+qJ+L7$ zvuyHr$hN!iDQtPAX%cF@P!+0Z^=u;W8t~P7DgboLZ3j5!@{p6s~Gxv%23m_SSW+ z-t$yke{7GT4_zNN-XYT@svP|QXaT4x%Tvc8t`FEIy-UCYN677oGG)qV2X*)CO3KA)w-d9 zkxNBP0Y)K@8u6Y*;N|3k<}KBCod(Cn_6;N&Z~&O_aZf};dXdD+KB^9rV^`#L>+)7@ zCrmWx81t$o>Zm{}t{?ScqqS)>QB(mkTD2Ve?;lCXHqCm0_u;u>$C!ginI$;@ZNLBq zwaShNXt}SEu6Nj!wx}C)79*}}8g7bt+UZ4pc~IxasBeuh*UywQ3b2=mdf4e-{F6oe z?l~q+@mWO+^HJpEnmhc(Ai0*mD&QrokF=({$V|xajR7?7 z+5WWW|MWeKK^cRDyO{^c9^V917#!pkKvkH9Y{sE$=c9~5#eyJ> zM`zrkk~~G`)7t5f?1|y7NDnEz54TBkh4ix7AD_As)P~B5@v?xJ_sjG&WWgCdE_Fm zz4ZTAG&jG+jA8tV0p^Lt#)iAG$!>F_#DC@M+BPzs|9N4pkg97fOak_tZhparsR5qp zkptVkq~#6;Ob<630;aX)#EUgI8Pzp8Xh2&?|v9>6>PnnN!-TweS~^ zk$i;0yYihZX2X(|B^0YwdD(iYJ*Ui3ZxZ8 za18#vl`lZFeh@Jf)q?8bu;tflW*#WPi$CvJtupm@yYtI z8Xv}@pF(N%Qz#~xpbfZ*EACLmS(aCYyR^nY(#1L*XXNSar?Txd0<$39s9 z{t5^SMD3V~M0Bu{#!8r@o5X=AP0{Qpuh{%AsZG*AS6eaaoDq|ZHUl-tnaf2H`^M;y z)qojeQP90Fpcu)1FX7!=B6-x3OgHa+hNL1xfY5%WZkP;^g)S@O;I#|Sj@#J48j>zh zdTO%MBjR%&z-^8tTb zJID0~V1~&}m-$c0#Lccr01AyJm#E}Tf@O~9i+4saHIg@W@a;0Y*Pv9R&xKC7fQ{{Z z8?w_jE~x%M9)KFg0jTU!3Zr@<=g;*&r&P=5l&oy$**9f&ZOO5UX)j%nu_;=kz_2=} zH}YkKf7YLd_DnE)Wi~s~KSJg|NtY^Ga2wLns-;jgipbiL=~inblPG1TNQMqGc!+8Z zIRylO{{HS}E3+D~u`6#EeJjNBq)|Fu(c;rY6Xd4hu8P$=fP55%HNS1Jj;KX|FDQ^+ zkBTE^?T{*QfYWAq(1@EuSE@%Z>W=+#VaiE1rR;92)RK1=>#?G0oq!^mWJ#yfH<7&f zToD4UV`8s-nPHUVbTP;Z@$mls;YG+y+hq1`(H@9YbvaV&a!nyMKm&_n- z(V%CV@K?8c$^z40Jn+=o-0!b^y%mz?#)+mEOe z8pzK7&TV%k0StT}&&Z5n{Uu%#?`ttT##4b;FX$Q(sV8I{_8T8ob(-Br;q)7A(lZnV zM92FL;uW*`RCLchai{AR8c50PXPCT;9|W+|M!YztwD;S)n=)mo;^y-w zhdayaa+CM}h0kr_lR}X|X{}Ydib!mHtN;B94Ryl1oI6qSIFn;_C(@t$1_Zwle-Dep zAsYKhuciOkn8~`neb^I5Alq#Xr+LE6QEY@~BE{I7f~$!Rz&- zy1tjVrl~bGJiAJO66Z*#p42J$k>eDp*t=S;P7 zBKg7vhnl&SyC9D_027-#^u8o}Ok@cQHl%T01zE;XxKFFi7#%5kc)H8(Gahz9FL|Tu zRbm&MW$p1+S)1@N0)Y;0dp#u*y*{sK=j;poNommy&rLLp@-|0bpSG{?w@4TKWz#T1 z&OQuZ5%bI*MZC1%VvTPApzjvh1*MY=>Q&lvg7EQs&xBzV3m9~q_w_S0Z|dm(?SUV7 zCHOJ-E&aXi4e6r23gl%wjo?YWnlD$`W9LBMwxbtdO{SUH^^Iw}$4odtm1v`|CJnVk zALD|36SC6(7<6#Nv23xjYWV~-4^pcLaOD!aCKq5lDJZ96ayQDt5MSD-&=;Sb-E<_^ zVtqfomd6g$LkEDtBv3jw+cj{$oM<~YeCQJ|@9I5t)}V(R=R$dpML_WO|hF z-Z#a&YWy8EOj!v>im@2_P(D1}nW|f81nFh=;5x55tF(Gc>?OBIQKh^0At5MSe&}lLf}uW&@Qe9G}4 ze5{nFc7Wdx#ll7O`6|Cla;j>3)<8pv6t+qB%^=qZQrjO*kowv4pvL!1PDLT&pk{;D zM6q;%-P|Ndw)nFt*aG>+Ro>!t1k*10_l>H(Nv3cU?k-_~(i++Ez%w^EP%Vk#%MZHEPJF*WLp^zj9 zMJ1>1WqJu3; zhY_JJ=kx#z+91q?ytB#fsdu>f3u^arB2KMsYU#+%nlTV>b!Zf|PV}$tfxI%H%dN~= z6njipI6||SYmB|beflJVm{AvP8}MicFCxh8W5DzuEN2p9d70zKS?4vS?=cC1QY|~y_HiKH+4HPvSsHahb6i= zMtG~A(-i1ZUG>3=??&~aPX9cBWVYbzAWer?QOQ(a)xTP z>Oi0`V0pLyAT|Y0=xPeNLZ)>+xpE_9cAQcyU``x94RoeJa0ig{(I39W4hE&b`}Cci ze5$^(Uq-{3i_FXZZdCeUD^;9Am6Bc*OESL5M#~WYa91!P(u64O=g2z`47u2nr1ca5 zGz02Z@I{stAnQF0r;uzLibFyz1uR3#J=mOo9d*-N!oHKM27j zfhmUjY-(az|MVGVQKS!{=L(D=vgV(@KY(Y$gT3{n*l=LOquWEDt~Z(+$^QZx_?NSd zgqrcU+QNg7C4y2i=1nOEN!!TW?$9*Bnpt((wHT!!A9dHSkL(B1Zc|7^1i<7JcAIu6 zuX?@yaLt~jcZAJO*HeAV?o0@KO2bW9pX$kcUc)4r)O*G@H$&J68BeD6gLC~ZcT13` zL?F)p0$4)d0tTg_Ru++00G*U1hYfgcrlQ(4pFNQgf9YMHaF=Z3ZZZ=oD}a@JEZlqw z6|*m1()P=#dQ6|_tqh;N;I#LtmGIfWN|D~IHTjwC;>^KPQY89y^Sn%JxfYr$d zCR)q`KH**S8g_X0UCQdT;=J0(ttFSWwveg(!mdeVd2ZaR@7ipg()bTFH}*MwFcoJ& zUx7PU3`qG)hj{1cg3St2qc>o^o98^F!aSkuoM*;fc7gGI=W>JgysQCssQw(eJg&jt z%H^r#`2Fi$kH@#`%6n`Witwh5sXC1?`49C|Y5i=7@}=1ad>{~Y(?O^(xz+mldOz%A zg3y4Xr)&mWbHA8)tblza6=-;q_@4PTZ-BI86~w|t<_{OYuoxxj$Q(nR d?bGn@@!z;X|NS=n```HQ{{ST6i>eH|003YSfCm5o literal 2181 zcmV;02zvJ)iwFP!000001C3ehkK4u(m+ho>-1^ZZK-BE-Zr!-iyLk(~zIPLT-RQm7`vCslW$IFFehyiY_jo)0Q0;@B{cAH}BOfk<4OPie_Vt$MXyq^VDC%~eEAtuaDZ>OWlq6b`ESF9_+7*^pI@4NYbs_1o@{qrQALDJ)J@(_ z%P>ZQDzXLN&!Veb!0BUQ*zw$JN3m_&!9ph8@WXC6@x!>xZOM?Y9YVs0yM%->%X1f8 zCYfV+#R}%L)E>TwOMi8h+M29++Q~qpF8v~r7mi_(=KhKowv2?Y9Fk>Lz#rSDPG~Me zhcz;=M&dwcGIOBwjD@@#zw@?aX|)I19Xt*oOksDx;`v@b%VGh&gpjGx^br9J!185L z0h;+|WPZZJNTl35<4mO@T|0$LcoE5!jOF^Ong58)pGRCOGxhmvt+>$h8}1#6iVZ;U zEs-$~%a|Kgzt{mC@HqC4c*HhBn#22I2X@R;nRI@7xdS$-muAYF^5UIT|1k{T?qHbm zQh`xDmaf2v>r@~UKHb4HL&@g59hkYyK#j^_mF&?*#wC*dmmPd$xOT{OTL{W!2$rmN zJy$a1dCsr1)49m=lGm!u)S9(n^`ed>uDl==JPo;yJ_W&3@USQqZ(!dzVTHI5p-{*5fKj;3OMDGC#nN7u59(ls0euHXjluJy<@HITslw6xDb#T(!vX_z&&kjJ=+C_ zVlP+&`kMpP)CbsF<@L(cDKA{f?guA(sU1&JJ5yJ`DQc!=0{KVpybpN<+Jb$DV3z5~ z(rVI;KXPGFVS4VEpL12Ye!p*^hCmWX61}F$$oZZSS;(bC$IW#*jfZje-LsECmg#Dr zxT|C-p&@d#J%>}NFP9Lt24D40E0&=;6Y%)LugaeIW!tiUcZwL865B7Kv({;^u2=5ZB#nKoz(sQV1>4U{4603+{n3LA=x3$7qZ ziS72llGucS&VIy~mWIb=8aC~Ql`AZusYKi5@3|XjInJ|DmGavT{?x)LS!UF4f1-#J zsI=O*m9sWnZ3KVkUUO$%vaI~0U(Z$|PxNgAU05izDot*^NCleLmos;GPHIC~{B=sd*inwd%tX)s7;tBMV-UyOGXdWS+ zAYMp!kVfL%>XROH5M0P#bi=xgYB#TVnuErZD#JTq##%Zb@Gych*Hvc>gPZR<);)_- z94@OMa-^-Oy83|B4oZki{U`>#zSwke^tH(-3oWCXA+{LEoE%Ze2CI z0OL3iC`!6AO(7^e=`EVVUZJC@dn}VF4^axJufkjT3|ZzlF2&03v>tc!CSvK@A7g-( zGLW52LDMbGF4Lc;kafzWO_W;WA=kGGsT}%D;I6W5vWF-i;5W)AIL_;DBRG`g@oyWl z{RV&xHrG#>%vv8QSiff3!TSkH4*FHontot1fElL$A$=db1C3Pu?0M6qYqne}z7cpc zzh7~<+l#CkqB1^E^4)=vb5G0pZ?F3K)n2T6vThb@|3|AIrVlRQuET-)pZ@$GC}vKB H*&P4?;J-pM From 4fa66d0a993eac45a68de955a65546827dc60cc9 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Mon, 22 Jan 2024 08:13:29 -0600 Subject: [PATCH 10/16] Update help info for the two datasets --- DESCRIPTION | 4 ++-- NEWS | 2 +- R/arxiv_cats-data.R | 9 +++++---- R/query_terms-data.R | 2 +- man/arxiv_cats.Rd | 9 +++++---- man/query_terms.Rd | 2 +- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a892d48..8adf917 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: aRxiv Title: Interface to the arXiv API -Version: 0.7.1 -Date: 2024-01-20 +Version: 0.7.2 +Date: 2024-01-22 Authors@R: c(person("Karthik", "Ram", role="aut", email="karthik.ram@gmail.com", comment=c(ORCID = "0000-0002-0233-1757")), person("Karl", "Broman", role=c("aut","cre"), diff --git a/NEWS b/NEWS index 4e938bd..c917c3b 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -aRxiv 0.7.1 +aRxiv 0.7.2 ----------- MINOR CHANGES diff --git a/R/arxiv_cats-data.R b/R/arxiv_cats-data.R index 07dcff2..3def2f6 100644 --- a/R/arxiv_cats-data.R +++ b/R/arxiv_cats-data.R @@ -7,11 +7,12 @@ #' #' @usage data(arxiv_cats) #' -#' @format A data frame with two columns: the abbreviations of the -#' subject classifications (`abbreviation`) and the corresponding -#' description (`description`). +#' @format A data frame with five columns: the abbreviations of the +#' subject classifications (`category`), the field of study, +#' subfield of study (within Physics; `NA` otherwise), a short +#' description, and a longer description. #' -#' @source +#' @source #' #' @keywords datasets #' diff --git a/R/query_terms-data.R b/R/query_terms-data.R index 3a23f55..c3473a9 100644 --- a/R/query_terms-data.R +++ b/R/query_terms-data.R @@ -11,7 +11,7 @@ #' #' @author Karl W Broman #' -#' @source +#' @source #' #' @keywords datasets #' diff --git a/man/arxiv_cats.Rd b/man/arxiv_cats.Rd index ea29d1e..4eb4479 100644 --- a/man/arxiv_cats.Rd +++ b/man/arxiv_cats.Rd @@ -5,12 +5,13 @@ \alias{arxiv_cats} \title{arXiv subject classifications} \format{ -A data frame with two columns: the abbreviations of the -subject classifications (\code{abbreviation}) and the corresponding -description (\code{description}). +A data frame with five columns: the abbreviations of the +subject classifications (\code{category}), the field of study, +subfield of study (within Physics; \code{NA} otherwise), a short +description, and a longer description. } \source{ -\url{https://arxiv.org/help/api/user-manual} +\url{https://arxiv.org/category_taxonomy} } \usage{ data(arxiv_cats) diff --git a/man/query_terms.Rd b/man/query_terms.Rd index 0e1db2d..7dc80f0 100644 --- a/man/query_terms.Rd +++ b/man/query_terms.Rd @@ -9,7 +9,7 @@ A data frame with two columns: the \code{term} and corresponding \code{description}. } \source{ -\url{https://arxiv.org/help/api/user-manual} +\url{https://arxiv.org/help/api/user-manual.html} } \usage{ data(query_terms) From 8d6ac75784f8074723139850279326d22c40788c Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Mon, 22 Jan 2024 08:15:38 -0600 Subject: [PATCH 11/16] Update query_terms.RData (newer storage format for R > 3.5) --- data/query_terms.RData | Bin 353 -> 406 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data/query_terms.RData b/data/query_terms.RData index 6e12fb87de01fed0e26bf88fc2ed2e417bfac809..c19ab69b4b8159b0096a19f5295496929eac6c4b 100644 GIT binary patch literal 406 zcmV;H0crjpiwFP!0000017%W8PXaLzt+1=I2E_OWdhlYRXyV125{MxtctA+-M7p~J z*6g;}&T9D6#$SMTp<}YxWYg)JdGqmRNB!wdG>sTzbr#fXtR6a(g_H5aY^QBwLOe(bSzV(Jb zI~kfjL#zBl?$6lf+v{zI0;D=0wbrV=**29*r!+(_}!YmaCP4HL=i!ZC0 zpzwAY?PrOK&h8}+7nMqpa`_Sp3$|O Date: Mon, 22 Jan 2024 08:15:56 -0600 Subject: [PATCH 12/16] Update of datasets requires R >= 3.5 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8adf917..3832d3a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Description: An interface to the API for 'arXiv', URL: https://docs.ropensci.org/aRxiv/, https://github.com/ropensci/aRxiv BugReports: https://github.com/ropensci/aRxiv/issues Depends: - R (>= 3.0.0) + R (>= 3.5.0) License: MIT + file LICENSE Imports: httr, From e08ff9268b399da81f5af4c0bbd6987f44da9402 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Mon, 22 Jan 2024 08:23:14 -0600 Subject: [PATCH 13/16] Remove reference to fulltext, which was removed from CRAN --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index f337262..8f9325a 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,6 @@ Licensed under the [MIT license](https://cran.r-project.org/web/licenses/MIT). ( --- -This package is part of a richer suite called [fulltext](https://github.com/ropensci/fulltext), along with several other packages, that provides the ability to search for and retrieve full text of open access scholarly articles. We recommend using `fulltext` as the primary R interface to `arXiv` unless your needs are limited to this single source. - ---- - ## Citation Get citation information for `aRxiv` in R by running: `citation(package = 'aRxiv')` From 9568edef59687ffcbe14bce6bb2d0accc4342e93 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Mon, 22 Jan 2024 08:24:35 -0600 Subject: [PATCH 14/16] Fix a bunch of URLs --- README.md | 4 +- inst/doc/aRxiv.html | 655 ++++++++++++++++++++++++++++++++------------ vignettes/aRxiv.Rmd | 10 +- 3 files changed, 493 insertions(+), 176 deletions(-) diff --git a/README.md b/README.md index 8f9325a..6971fb9 100644 --- a/README.md +++ b/README.md @@ -70,8 +70,8 @@ vignette("aRxiv", "aRxiv") * [arXiv API](https://arxiv.org/help/api/index) * [arXiv API user manual](https://arxiv.org/help/api/user-manual) * [Bulk data access to arXiv](https://arxiv.org/help/bulk_data) -* [Bulk data access to arXiv metadata via OAI-PMH](https://arxiv.org/help/oa/index) -* [Bulk data access to arXiv PDFs and source docs](https://arxiv.org/help/bulk_data_s3) +* [Bulk data access to arXiv metadata via OAI-PMH](https://arxiv.org/help/oa/index.html) +* [Bulk data access to arXiv PDFs and source docs](https://arxiv.org/help/bulk_data_s3.html) ### License diff --git a/inst/doc/aRxiv.html b/inst/doc/aRxiv.html index d91309a..634f1b6 100644 --- a/inst/doc/aRxiv.html +++ b/inst/doc/aRxiv.html @@ -14,25 +14,38 @@ aRxiv tutorial - + +code{white-space: pre-wrap;} +span.smallcaps{font-variant: small-caps;} +span.underline{text-decoration: underline;} +div.column{display: inline-block; vertical-align: top; width: 50%;} +div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} +ul.task-list{list-style: none;} + +