---
title: 'CS5200 Fall 2020: Practicum 3'
author: "Chandra Davis, Evan Douglass"
output:
  pdf_document: default
  word_document: default
  html_document:
    df_print: paged
---
## Overview
We've decided to work with SQLite for this practicum, so you will need SQLite installed on your machine to work with these files. The data we are using was provided with the practicum.
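If any of the R packages loaded in the setup chunk below are missing, a one-time install along these lines should be enough (a sketch; the package list simply mirrors the `library()` calls that follow):
```{r, eval=FALSE}
# One-time setup (sketch): install the packages loaded in the next chunk
install.packages(c("RSQLite", "XML", "sqldf", "dplyr", "tibble", "sjmisc", "ggplot2"))
```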
```{r, warning=FALSE, message=FALSE}
# Libraries needed for processing
library(RSQLite)
library("XML")
library(sqldf)
library(dplyr)
library(tibble)
library(sjmisc)
library(ggplot2)
# Set to FALSE if you do not want to print the top 5 rows from each table in the transactional database
printHead = TRUE
```
## Part 1
Create a normalized relational OLTP database and populate it with data from an XML document.
### Task 1
Create a normalized relational schema that contains minimally the following entities: Article, Journal, Author, History. Use the XML document to determine the appropriate attributes (fields/columns) for the entities (tables). While there may be other types of publications in the XML, you only need to deal with articles in journals. Create appropriate primary and foreign keys. Where necessary, add surrogate keys. Include an image of an ERD showing your model in your R Notebook.
Lucidchart link:
![Task1.1](imgs/CS5200 - Practicum 3 ERD.png)
### Task 2
Realize the relational schema in SQLite (place the CREATE TABLE statements into SQL chunks in your R Notebook).
```{r}
DB_NAME <- "pubMed.db"
conn <- dbConnect(RSQLite::SQLite(), DB_NAME)
```
```{r, results="hide"}
# Since the dataset is small, the database should be re-created at runtime
drop_table <- function(table_name) {
paste("DROP TABLE IF EXISTS ", table_name, ";", sep="")
}
# Since we are dropping all tables, disable the FK checks
dbExecute(conn, "PRAGMA foreign_keys = OFF;")
# Get a list of all tables currently in the database
table_list <- dbListTables(conn)
# Drop every table in the database
for(table in table_list){
if(!str_contains(table,"sqlite")){
dbExecute(conn, drop_table(table))
}
}
dbExecute(conn, "PRAGMA foreign_keys = ON;")
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS ELocType (
eType_id INTEGER PRIMARY KEY AUTOINCREMENT,
eType TEXT NOT NULL,
CONSTRAINT unique_eLocType_eType UNIQUE (eType)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS CategoryLabels (
label_id INTEGER PRIMARY KEY AUTOINCREMENT,
label TEXT NOT NULL,
CONSTRAINT unique_catLab_label UNIQUE (label)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Affiliations (
aff_id INTEGER PRIMARY KEY AUTOINCREMENT,
aff TEXT NOT NULL,
CONSTRAINT unique_aff_aff UNIQUE (aff)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Pagination (
pgn_id INTEGER PRIMARY KEY AUTOINCREMENT,
medlinePgn TEXT NOT NULL,
CONSTRAINT unique_pgn_medpgn UNIQUE (medlinePgn)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Languages (
lang_id INTEGER PRIMARY KEY AUTOINCREMENT,
language TEXT NOT NULL,
CONSTRAINT unique_lang_lang UNIQUE (language)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS IsoAbbreviation (
abbr_id INTEGER PRIMARY KEY AUTOINCREMENT,
abbr TEXT NOT NULL,
CONSTRAINT unique_isoAbbr_abbr UNIQUE (abbr)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS MediumType (
medium_id INTEGER PRIMARY KEY AUTOINCREMENT,
medium TEXT NOT NULL,
CONSTRAINT unique_med_med UNIQUE (medium)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Countries (
country_id INTEGER PRIMARY KEY AUTOINCREMENT,
country TEXT NOT NULL,
CONSTRAINT unique_country_country UNIQUE (country)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Agencies (
agency_id INTEGER PRIMARY KEY AUTOINCREMENT,
agency TEXT NOT NULL,
CONSTRAINT unique_agency_agency UNIQUE (agency)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Acronyms (
acronym_id INTEGER PRIMARY KEY AUTOINCREMENT,
acronym TEXT NOT NULL,
CONSTRAINT unique_acr_acr UNIQUE (acronym)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS PublicationType (
pubType_id INTEGER PRIMARY KEY AUTOINCREMENT,
pubType TEXT NOT NULL,
CONSTRAINT unique_pubType_pubType UNIQUE (pubType)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS PubStatus (
status_id INTEGER PRIMARY KEY AUTOINCREMENT,
status TEXT NOT NULL,
CONSTRAINT unique_pubStatus_status UNIQUE (status)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Date (
date_id INTEGER PRIMARY KEY AUTOINCREMENT,
year INTEGER NOT NULL,
month INTEGER NOT NULL,
day INTEGER NOT NULL,
CONSTRAINT unique_date_yymmdd UNIQUE (year, month, day)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Article (
article_id INTEGER PRIMARY KEY AUTOINCREMENT,
pubModel INTEGER NOT NULL,
title Text NOT NULL,
pgn_id INTEGER NOT NULL,
authorListComplete BOOLEAN NOT NULL,
lang_id INTEGER NOT NULL,
grantListComplete BOOLEAN,
copyright Text,
FOREIGN KEY (pubModel) REFERENCES MediumType(medium_id),
FOREIGN KEY (pgn_id) REFERENCES Pagination(pgn_id),
FOREIGN KEY (lang_id) REFERENCES Languages(lang_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS ArticleDate (
artDate_id INTEGER PRIMARY KEY AUTOINCREMENT,
article_id INTEGER NOT NULL,
dateType INTEGER NOT NULL,
date_id INTEGER NOT NULL,
FOREIGN KEY (article_id) REFERENCES Article(article_id),
FOREIGN KEY (dateType) REFERENCES MediumType(medium_id),
FOREIGN KEY (date_id) REFERENCES Date(date_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS PubTypeList (
pubList_id INTEGER PRIMARY KEY AUTOINCREMENT,
article_id INTEGER NOT NULL,
pubType_id INTEGER NOT NULL,
FOREIGN KEY (pubType_id) REFERENCES PublicationType(pubType_id),
FOREIGN KEY (article_id) REFERENCES Article(article_id),
CONSTRAINT unique_pubTypeList_type_art UNIQUE (pubType_id, article_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS ELocationID (
eLoc_id INTEGER PRIMARY KEY AUTOINCREMENT,
eType_id INTEGER NOT NULL,
valid BOOLEAN NOT NULL,
value Text NOT NULL,
FOREIGN KEY (eType_id) REFERENCES ELocType(eType_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS ELocList (
elList_id INTEGER PRIMARY KEY AUTOINCREMENT,
article_id INTEGER NOT NULL,
eLoc_id INTEGER NOT NULL,
FOREIGN KEY (eLoc_id) REFERENCES ELocationID(eLoc_id),
FOREIGN KEY (article_id) REFERENCES Article(article_id),
CONSTRAINT unique_eloclist_eLoc_art UNIQUE (eLoc_id, article_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS PubDate (
pubDate_id INTEGER PRIMARY KEY AUTOINCREMENT,
medlineDate BOOLEAN NOT NULL,
dateYYMM Text NOT NULL,
CONSTRAINT unique_pubDate_med_date UNIQUE (medlineDate, dateYYMM)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS JournalIssue (
issue_id INTEGER PRIMARY KEY AUTOINCREMENT,
volume INTEGER NOT NULL,
issue INTEGER NOT NULL,
citedMedium INTEGER NOT NULL,
pubDate_id INTEGER NOT NULL,
FOREIGN KEY (citedMedium) REFERENCES MediumType(medium_id),
FOREIGN KEY (pubDate_id) REFERENCES PubDate(pubDate_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS ArticleList (
artList_id INTEGER PRIMARY KEY AUTOINCREMENT,
article_id INTEGER NOT NULL,
issue_id INTEGER NOT NULL,
FOREIGN KEY (issue_id) REFERENCES JournalIssue(issue_id),
FOREIGN KEY (article_id) REFERENCES Article(article_id),
CONSTRAINT unique_artlist_issue_art UNIQUE (issue_id, article_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS ISSN (
issn_id INTEGER PRIMARY KEY AUTOINCREMENT,
value Text NOT NULL,
issnType INTEGER NOT NULL,
CONSTRAINT unique_issn_typeissn UNIQUE (issnType, value),
FOREIGN KEY (issnType) REFERENCES MediumType(medium_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Journal (
journal_id INTEGER PRIMARY KEY AUTOINCREMENT,
title Text NOT NULL,
issn_id INTEGER NOT NULL,
isoAbbr INTEGER NOT NULL,
FOREIGN KEY (issn_id) REFERENCES ISSN(issn_id),
FOREIGN KEY (isoAbbr) REFERENCES IsoAbbreviation(abbr_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS IssueList (
issueList_id INTEGER PRIMARY KEY AUTOINCREMENT,
issue_id INTEGER NOT NULL,
journal_id INTEGER NOT NULL,
FOREIGN KEY (issue_id) REFERENCES JournalIssue(issue_id),
FOREIGN KEY (journal_id) REFERENCES Journal(journal_id),
CONSTRAINT unique_issuelist_issue_journal UNIQUE (issue_id, journal_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS AbstractText (
abText_id INTEGER PRIMARY KEY AUTOINCREMENT,
abText Text NOT NULL,
label INTEGER,
nlmCategory INTEGER,
FOREIGN KEY (label) REFERENCES CategoryLabels(label_id),
FOREIGN KEY (nlmCategory) REFERENCES CategoryLabels(label_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS AbstractList (
abList_id INTEGER PRIMARY KEY AUTOINCREMENT,
article_id INTEGER NOT NULL,
abText_id INTEGER NOT NULL,
FOREIGN KEY (abText_id) REFERENCES AbstractText(abText_id),
FOREIGN KEY (article_id) REFERENCES Article(article_id),
CONSTRAINT unique_ablist_abstract_art UNIQUE (abText_id, article_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Authors (
author_id INTEGER PRIMARY KEY AUTOINCREMENT,
valid BOOLEAN NOT NULL,
lName Text NOT NULL,
fName Text NOT NULL,
initials Text NOT NULL,
aff_id INTEGER,
FOREIGN KEY (aff_id) REFERENCES Affiliations(aff_id),
CONSTRAINT unique_auth_authors UNIQUE (lName, fName, initials, aff_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS AuthorList (
authList_id INTEGER PRIMARY KEY AUTOINCREMENT,
article_id INTEGER NOT NULL,
author_id INTEGER NOT NULL,
FOREIGN KEY (author_id) REFERENCES Authors(author_id),
FOREIGN KEY (article_id) REFERENCES Article(article_id),
CONSTRAINT unique_authlist_author_art UNIQUE (author_id, article_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS Grants (
grant_id INTEGER PRIMARY KEY AUTOINCREMENT,
grantID Text NOT NULL,
acronym_id INTEGER,
agency_id INTEGER NOT NULL,
country_id INTEGER NOT NULL,
FOREIGN KEY (acronym_id) REFERENCES Acronyms(acronym_id),
FOREIGN KEY (agency_id) REFERENCES Agencies(agency_id),
FOREIGN KEY (country_id) REFERENCES Countries(country_id),
CONSTRAINT unique_grant_id UNIQUE (grantID)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS GrantList (
grantList_id INTEGER PRIMARY KEY AUTOINCREMENT,
article_id INTEGER NOT NULL,
grant_id INTEGER NOT NULL,
FOREIGN KEY (grant_id) REFERENCES Grants(grant_id),
FOREIGN KEY (article_id) REFERENCES Article(article_id),
CONSTRAINT unique_grantlist_grant_art UNIQUE (grant_id, article_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS PubMedDate (
pubMedDate_id INTEGER PRIMARY KEY AUTOINCREMENT,
status INTEGER NOT NULL,
date INTEGER NOT NULL,
hour INTEGER,
minute INTEGER,
FOREIGN KEY (status) REFERENCES PubStatus(status_id),
FOREIGN KEY (date) REFERENCES Date(date_id)
);
```
```{sql connection=conn}
CREATE TABLE IF NOT EXISTS History (
history_id INTEGER PRIMARY KEY AUTOINCREMENT,
article_id INTEGER NOT NULL,
pubMedDate_id INTEGER NOT NULL,
FOREIGN KEY (pubMedDate_id) REFERENCES PubMedDate(pubMedDate_id),
FOREIGN KEY (article_id) REFERENCES Article(article_id),
CONSTRAINT unique_history_date_art UNIQUE (pubMedDate_id, article_id)
);
```
### Task 3
Extract and transform the data from the XML and then load into the appropriate tables in the database. You cannot use xmlToDataFrame but instead must parse the XML node by node using a combination of node-by-node tree traversal and XPath. It is not feasible to use XPath to extract all journals, then all authors, etc. as some are missing and won't match up. You will need to iterate through the top-level nodes.
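Before the full extraction below, here is a minimal sketch of the traversal pattern we rely on: iterate over the top-level `<PubmedArticle>` nodes and evaluate XPath relative to each node, using the node index as the article key (the variable names here are illustrative only):
```{r, eval=FALSE}
# Minimal sketch of node-by-node traversal; the chunks that follow use this same pattern
sampleRoot <- xmlRoot(xmlParse("pubmed_sample.xml"))
for (i in 1:xmlSize(sampleRoot)) {
  articleNode <- sampleRoot[[i]]
  # XPath evaluated relative to the current article node, so values stay tied to article i
  title <- xpathSApply(articleNode, ".//ArticleTitle", xmlValue)
  print(paste(i, title))
}
```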
```{r, warning=FALSE}
# Read the XML file into memory
file <- xmlParse(file = "pubmed_sample.xml")
# Get the first node
root <- xmlRoot(file)
```
```{r, warning=FALSE}
# Categorical tables only hold unique values, so they do not need to be built node by node
# There are 12 categorical attribute tables
# Helper functions below avoid duplicating the processing for each one
getVector <- function(path, att){
# Grab all of the unique values for this XPath expression
if(att){
values <- unique(xpathSApply(file, path))
}
else{
values <- unique(xpathSApply(file, path, xmlValue))
}
return(values)
}
addId <- function(df){
# Add unique ids
df <- tibble::rowid_to_column(df, "id")
return(df)
}
dfToDatabase <- function(df, tablename, needID){
if(needID){
df_table <- addId(df)
}
else {
df_table <- df
}
# Update column names to match table
names <- dbGetQuery(conn,
paste("pragma table_info(",tablename,")",sep=""))[["name"]]
colnames(df_table) <- names
# Put in database
# Append=TRUE is the preferred method because we want to maintain the referential integrity relationships from table creation
dbWriteTable(conn, tablename, df_table, append=TRUE)
# Return the dataframe for processing FK relationships
return(df_table)
}
catTable <- function(path, tablename, att, mult=FALSE){
# A few of the categorical tables are made of multiple XML elements/attributes
if(mult){
values <- c()
split <- unlist(strsplit(path, ","))
for(i in split){
values <- append(values,getVector(i,att))
}
values <- unique(values)
}
else {
values <- getVector(path, att)
}
df_table <- data.frame(values)
return(dfToDatabase(df_table,tablename, TRUE))
}
```
```{r}
# Call function to build all twelve tables
# Save the dataframes for look-up in other tables
df_aff <- catTable("//Affiliation", "Affiliations", FALSE)
df_pgn <- catTable("//MedlinePgn", "Pagination", FALSE)
df_lang <- catTable("//Language", "Languages", FALSE)
df_iso <- catTable("//ISOAbbreviation", "IsoAbbreviation", FALSE)
df_pubType <- catTable("//PublicationType", "PublicationType", FALSE)
df_country <- catTable("//Country", "Countries", FALSE)
df_agency <- catTable("//Agency", "Agencies", FALSE)
df_acr <- catTable("//Acronym", "Acronyms", FALSE)
df_eIdType <- catTable("//@EIdType", "ELocType", TRUE)
df_pubStatus <- catTable("//@PubStatus", "PubStatus", TRUE)
df_labels <- catTable("//@Label,//@NlmCategory", "CategoryLabels", TRUE, TRUE)
df_mediums <- catTable("//@IssnType,//@CitedMedium,//@PubModel,//@DateType",
"MediumType", TRUE, TRUE)
```
The remaining tables need to be processed node by node because some articles have extra or missing nodes, so not all of the tables/attributes have a 1:1 relationship with the article.
The next few chunks read the XML into initial dataframes for further manipulation.
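As a quick illustration of why a document-wide XPath pull would misalign (a hypothetical check, not part of the load): optional elements can return fewer hits than there are articles, so positions alone cannot tie values back to their article.
```{r, eval=FALSE}
# Sketch: counts from document-wide XPath; optional elements (e.g. GrantList) may
# return fewer nodes than there are articles, so the flat vectors would not line up
length(xpathSApply(file, "//PubmedArticle"))
length(xpathSApply(file, "//GrantList"))
```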
```{r}
# Function to check for missing values
# Appends an NA value to the vector if the data does not exist
checkSize <- function(num, df){
if(length(df) < num){
return(append(df,NA))
}
else {
return(df)
}
}
```
```{r}
# Function processes the history XML element
# Uses the article number for tying back to the original article
histDF <- function(art, node){
status <- c()
year <- c()
month <- c()
day <- c()
hour <- c()
minute <- c()
for(i in 1:xmlSize(node)){
newNode <- node[[i]]
# Status is an attribute and values are obtained differently
status <- append(status,xmlGetAttr(newNode,"PubStatus",NA))
for(j in 1:xmlSize(newNode)){
child <- newNode[[j]]
name <- xmlName(child)
# Add to the appropriate vector depending on child name
if(name == "Year"){
year <- append(year, xmlValue(child))
}
else if(name == "Month"){
month <- append(month, xmlValue(child))
}
else if(name == "Day"){
day <- append(day, as.numeric(xmlValue(child)))
}
else if(name == "Hour"){
hour <- append(hour, xmlValue(child))
}
else if(name == "Minute"){
minute <- append(minute, as.numeric(xmlValue(child)))
}
else {
print(paste("Vector not created for:", name))
}
}
# Check for missing values on all vectors
status <- checkSize(i,status)
year <- checkSize(i,year)
month <- checkSize(i,month)
day <- checkSize(i,day)
hour <- checkSize(i,hour)
minute <- checkSize(i,minute)
}
hist_df <- data.frame("article_id"=art,
status, year, month, day, hour, minute)
return(hist_df)
}
# Create an empty dataframe to match dataframe from function
# History is the linking table between article and PubMedDate
df_pmDate <- data.frame("article_id"=c(), "status"=c(),
"year"=c(), "month"=c(), "day"=c(), "hour"=c(),
"minute"=c())
# Confirmed only one instance of history for every article
# Goes through every top level article
# Creates a combined dataframe for tables relating to the history
for(i in 1:xmlSize(root)){
newRoot <- root[[i]][[2]]
for(j in 1:xmlSize(newRoot)){
name <- xmlName(newRoot[[j]])
if(name=="History"){
df_pmDate <- rbind(df_pmDate,histDF(i,newRoot[[j]]))
}
}
}
```
```{r}
# Function processes the journal XML element
# Uses the article number for tying back to the original article
journalDF <- function(num, node){
issnType <- c()
issn <- c()
medium <- c()
volume <- c()
issue <- c()
med <- c()
date <- c()
title <- c()
iso <- c()
for(i in 1:xmlSize(node)){
newNode <- node[[i]]
name <- xmlName(newNode)
if(name=="ISSN"){
issnType <- append(issnType, xmlGetAttr(newNode,"IssnType",NA))
issn <- append(issn, xmlValue(newNode))
}
else if(name=="JournalIssue"){
medium <- append(medium, xmlGetAttr(newNode,"CitedMedium",NA))
for(j in 1:xmlSize(newNode)){
child <- newNode[[j]]
name2 <- xmlName(child)
if(name2 == "Volume"){
volume <- append(volume, as.numeric(xmlValue(child)))
}
else if(name2 == "Issue"){
issue <- append(issue, as.numeric(xmlValue(child)))
}
else if(name2 == "PubDate"){
if(xmlName(child[[1]])=="MedlineDate"){
med <- append(med, "Y")
date <- append(date, xmlValue(child[[1]]))
}
else {
med <- append(med, "N")
year <- xmlValue(child[[1]])
month <- xmlValue(child[[2]])
date <- append(date, paste(year,month))
}
}
else {
print(paste("Vector not created for: JournalIssue/", name2))
}
}
}
else if(name=="Title"){
title <- append(title, xmlValue(newNode))
}
else if(name=="ISOAbbreviation"){
iso <- append(iso, xmlValue(newNode))
}
else {
print(paste("Vector not created for:", name))
}
}
journal_df <- data.frame("article_id"=num, issnType, issn, medium, volume,
issue, med, date, title, iso)
return(journal_df)
}
```
```{r}
# Function processes the eLocationID XML element
# Uses the article number for tying back to the original article
eLocDF <- function(num, node){
valid <- xmlGetAttr(node,"ValidYN",NA)
type <- xmlGetAttr(node,"EIdType",NA)
value <- xmlValue(node)
eLoc_df <- data.frame("article_id"=num, valid, type, value)
return(eLoc_df)
}
```
```{r}
# Function processes the abstract XML element
# Uses the article number for tying back to the original article
abstractDF <- function(num, node){
cat <- c()
lab <- c()
text <- c()
for(i in 1:xmlSize(node)){
newNode <- node[[i]]
name <- xmlName(newNode)
if(name=="AbstractText"){
cat <- append(cat, xmlGetAttr(newNode,"NlmCategory",NA))
lab <- append(lab, xmlGetAttr(newNode,"Label",NA))
text <- append(text, xmlValue(newNode))
# Check for missing values on all vectors
cat <- checkSize(i,cat)
lab <- checkSize(i,lab)
text <- checkSize(i,text)
}
}
abstract_df <- data.frame("article_id"=num, cat, lab, text)
return(abstract_df)
}
```
```{r}
# Function processes the authorList XML element
# Uses the article number for tying back to the original article
authorDF <- function(num, node){
valid <- c()
last <- c()
first <- c()
init <- c()
aff <- c()
for(i in 1:xmlSize(node)){
newNode <- node[[i]]
valid <- append(valid, xmlGetAttr(newNode,"ValidYN",NA))
for(j in 1:xmlSize(newNode)){
child <- newNode[[j]]
name <- xmlName(child)
if(name == "LastName"){
last <- append(last, xmlValue(child))
}
else if(name == "ForeName"){
first <- append(first, xmlValue(child))
}
else if(name == "Initials"){
init <- append(init, xmlValue(child))
}
else if(name == "Affiliation"){
aff <- append(aff, xmlValue(child))
}
else {
print(paste("Vector not created for:", name))
}
}
# Check for missing values on all vectors
valid <- checkSize(i,valid)
last <- checkSize(i,last)
first <- checkSize(i,first)
init <- checkSize(i, init)
aff <- checkSize(i,aff)
}
author_df <- data.frame("article_id"=num, valid, last, first, init, aff)
return(author_df)
}
```
```{r}
# Function processes the publicationTypeList XML element
# Uses the article number for tying back to the original article
pubTypeDF <- function(num, node){
type <- c()
for(i in 1:xmlSize(node)){
type <- append(type, xmlValue(node[[i]]))
}
return(data.frame("article_id"=num, type))
}
```
```{r}
# Function processes the grantList XML element
# Uses the article number for tying back to the original article
grantDF <- function(num, node){
grantid <- c()
ac <- c()
ag <- c()
ctry <- c()
for(i in 1:xmlSize(node)){
newNode <- node[[i]]
for(j in 1:xmlSize(newNode)){
child <- newNode[[j]]
name <- xmlName(child)
if(name == "GrantID"){
grantid <- append(grantid, xmlValue(child))
}
else if(name == "Acronym"){
ac <- append(ac, xmlValue(child))
}
else if(name == "Agency"){
ag <- append(ag, xmlValue(child))
}
else if(name == "Country"){
ctry <- append(ctry, xmlValue(child))
}
else {
print(paste("Vector not created for:", name))
}
}
# Check for missing values on all vectors
grantid <- checkSize(i,grantid)
ac <- checkSize(i,ac)
ag <- checkSize(i,ag)
ctry <- checkSize(i,ctry)
}
grant_df <- data.frame("article_id"=num, grantid, ac, ag, ctry)
return(grant_df)
}
```
```{r}
# Function processes the articleDate XML element
# Uses the article number for tying back to the original article
artDateDF <- function(num, node){
type <- xmlGetAttr(node,"DateType",NA)
year <- as.numeric(xmlValue(node[[1]]))
month <- as.numeric(xmlValue(node[[2]]))
day <- as.numeric(xmlValue(node[[3]]))
artDate_df <- data.frame("article_id"=num, type, year, month, day)
return(artDate_df)
}
```
```{r}
# Create vectors needed for the article dataframe
id <- c()
title <- c()
pagination <-c()
aListC <- c()
gListC <- c()
lang <- c()
model <- c()
copyright <-c()
# Create empty dataframes to match the above functions
df_journal <- data.frame("article_id"=c(), "issnType"=c(), "issn"=c(),
"medium"=c(), "volume"=c(), "issue"=c(), "med"=c(),
"date"=c(), "title"=c(), "iso"=c())
df_eLoc <- data.frame("article_id"=c(), "valid"=c(), "type"=c(), "value"=c())
df_abstract <- data.frame("article_id"=c(), "cat"=c(), "lab"=c(), "text"=c())
df_author <- data.frame("article_id"=c(), "valid"=c(), "last"=c(),
"first"=c(), "init"=c(), "aff"=c())
df_pubTypeList <- data.frame("article_id"=c(), "type"=c())
df_grant <- data.frame("article_id"=c(), "grantid"=c(), "ac"=c(), "ag"=c(),
"ctry"=c())
df_artDate <- data.frame("article_id"=c(), "type"=c(), "year"=c(),
"month"=c(), "day"=c())
# Confirmed only one instance of article for every PubMedArticle
for(i in 1:xmlSize(root)){
newRoot <- root[[i]][[1]]
for(j in 1:xmlSize(newRoot)){
name <- xmlName(newRoot[[j]])
if(name=="Article"){
article <- newRoot[[j]]
id <- append(id,i)
model <- append(model,xmlGetAttr(article,"PubModel",NA))
for(k in 1:xmlSize(article)){
child <- article[[k]]
name <- xmlName(child)
if(name=="Journal"){
df_journal <- rbind(df_journal,journalDF(i,child))
}
else if(name=="ArticleTitle"){
title <- append(title,xmlValue(child))
}
else if(name=="Pagination"){
pagination <- append(pagination,xmlValue(child[[1]]))
}
else if(name=="ELocationID"){
df_eLoc <- rbind(df_eLoc,eLocDF(i,child))
}
else if(name=="Abstract"){
value <- xpathSApply(child,"CopyrightInformation", xmlValue)
if(class(value)=="list"){
copyright <- append(copyright,NA)
}
else {
copyright <- append(copyright,value)
}
df_abstract <- rbind(df_abstract,abstractDF(i,child))
}
else if(name=="AuthorList"){
aListC <- append(aListC,xmlGetAttr(child,"CompleteYN",NA))
df_author <- rbind(df_author,authorDF(i,child))
}
else if(name=="Language"){
lang <- append(lang,xmlValue(child))
}
else if(name=="GrantList"){
gListC <- append(gListC,xmlGetAttr(child,"CompleteYN",NA))
df_grant <- rbind(df_grant,grantDF(i,child))
}
else if(name=="PublicationTypeList"){
df_pubTypeList <- rbind(df_pubTypeList,pubTypeDF(i,child))
}
else if(name=="ArticleDate"){
df_artDate <- rbind(df_artDate,artDateDF(i,child))
}
else {
print(paste("Missing tables for",name))
}
}
# Check for missing values on all vectors
id <- checkSize(i,id)
title <- checkSize(i,title)
pagination <- checkSize(i,pagination)
aListC <- checkSize(i,aListC)
gListC <- checkSize(i,gListC)
lang <- checkSize(i,lang)
model <- checkSize(i,model)
copyright <- checkSize(i, copyright)
}
}
}
df_article <- data.frame(id, model, title, pagination, aListC, lang,
gListC, copyright)
```
Now that all of the data has been read, the dataframes need to be manipulated to match the table structures. Since the dfToDatabase function renames the columns positionally, the column order has to match the table exactly for the data to be added properly.
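One way to check the column order a table expects before handing a dataframe to dfToDatabase is the same pragma the function uses internally (shown here for the Article table as an example):
```{r, eval=FALSE}
# Sketch: column names, in order, that dfToDatabase will assign for the Article table
dbGetQuery(conn, "pragma table_info(Article)")[["name"]]
```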
```{r}
# Replace the applicable categorical attribute with the FK id
fkReplacement <- function(df_cat, df_list, catCol, listCol, name){
# Get a vector of the category values available
types <- df_cat[, 2]
# For each category, assign the FK id
for(term in types) {
search1 <- df_cat[,catCol] == term
# Get the FK id for the category
id <- df_cat[which(search1),][[paste0(name,"_id")]]
search2 <- df_list[,listCol] == term
# Assign the FK id to the matching rows (creates the update_id column on first use)
df_list$update_id[search2] <- id
}
# Drop the original categorical attribute column
df_list[,listCol] <- NULL
# Rename the FK id column to match attribute
names(df_list)[names(df_list) == "update_id"] <- paste0(name,"_id")
return(df_list)
}
```
```{r, message=FALSE}
# Replace FK attributes
df_article <- fkReplacement(df_mediums,df_article,2,2,"medium")
df_article <- fkReplacement(df_pgn,df_article,2,3,"pgn")
df_article <- fkReplacement(df_lang,df_article,2,4,"lang")
# Organize columns into correct order (needed for adding to database)
df_article <- data.frame(cbind(id, df_article$medium_id,
df_article$title, df_article$pgn_id,
df_article$aListC, df_article$lang_id,
df_article$gListC, df_article$copyright))
# Add the values to the database
df_article <- dfToDatabase(df_article,"Article",FALSE)
```
```{r}
# Replace FK attributes
df_pubTypeList <- fkReplacement(df_pubType,df_pubTypeList,2,2,"pubType")
# Add the values to the database
df_pubTypeList <- dfToDatabase(df_pubTypeList,"PubTypeList",TRUE)
```
```{r}
# Need to add the unique id key so the bridge list can be made
df_eLoc <- addId(df_eLoc)
# Bridge list should only have the article and the unique ids
df_eLocList <- df_eLoc[,c(2,1)]
colnames(df_eLocList) <- c("art","eLoc")
# Replace FK attributes
df_eLoc <- fkReplacement(df_eIdType,df_eLoc,2,4,"eType")
# Organize columns into correct order (needed for adding to database)
df_eLoc <- data.frame(cbind(df_eLoc$id, df_eLoc$eType_id, df_eLoc$valid,
df_eLoc$value))
# Add the values to the database
df_eLoc <- dfToDatabase(df_eLoc,"ELocationID",FALSE)
df_eLocList <- dfToDatabase(df_eLocList,"ELocList",TRUE)