-
Notifications
You must be signed in to change notification settings - Fork 0
/
metadata-preprocessing.R
55 lines (40 loc) · 1.75 KB
/
metadata-preprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
## Extract and format block diagram data for series-name indexing
## 2017-07-06
## D.E. Beaudette
library(plyr)
library(stringr)
# source data (verified July 2017)
# http://www.nrcs.usda.gov/Internet/FSE_DOCUMENTS/nrcs142p2_052891.xls
# trim-down and add column names in spreadsheet software
# 2017: file names are now links
# columns used by this script: file, series, survey_area, state, issue_date, caption
x <- read.csv('Metadata-diagrams.csv', stringsAsFactors = FALSE)

# replace semi-colon with comma, so that series lists use a single delimiter
x$series <- str_replace_all(x$series, fixed(';'), ',')

# strip URL from file names
# 2017: http://www.nrcs.usda.gov/Internet/NRCS_DIAGRAMS/graphics/AK-2011-05-27-01.tif
# fixed() matches the prefix literally; interpreted as a regular expression,
# every '.' in the URL would match any character
url.prefix <- 'http://www.nrcs.usda.gov/Internet/NRCS_DIAGRAMS/graphics/'
x$file <- str_replace_all(x$file, fixed(url.prefix), '')
## TODO: figure out why these are missing series names... maybe we can add them
# row numbers of records with no usable series list: NA, empty, or white space only
idx <- which(is.na(x$series) | grepl('^[[:space:]]*$', x$series))

# save the flagged records for review (written even when nothing was flagged)
write.csv(x[idx, ], file = 'missing-series-names.csv', row.names = FALSE)

# remove those records without a soil series
# guard required: with a zero-length idx, x[-idx, ] selects *no* rows and
# would silently discard the entire table
if (length(idx) > 0) {
  x <- x[-idx, ]
}
# split soil series list by file: one character vector of names per record
x.soils <- strsplit(x$series, ',')

# trim white space from both ends of each soil series name
# (stripping only the leading side leaves names such as "Alpha " that defeat
# the uniqueness check applied later)
# gsub() is vectorized, so each record's names are cleaned in a single call and
# the result is always a plain, un-named character vector
x.soils.trimmed <- lapply(x.soils, function(i) {
  gsub('^[[:space:]]+|[[:space:]]+$', '', i)
})
# build a list of data frames, one per record, each holding the long-formatted
# version of that record: one row per (file, series) pair
# the list is pre-allocated and left un-named
l <- vector('list', length = nrow(x))

# seq_len() yields an empty sequence when x has no rows; 1:nrow(x) would
# instead count down from 1 to 0 and index past the end of the list
for (i in seq_len(nrow(x))) {
  # the single file name is recycled across all series names of record i
  l[[i]] <- data.frame(file = x$file[i], series = x.soils.trimmed[[i]])
}
# stack the per-record data frames into a single long table
x.long <- ldply(l)

# keep only the first occurrence of each (file, series) row
x.long.unique <- x.long[!duplicated(x.long), , drop = FALSE]

# main look-up table, indexed by series name
write.csv(
  x.long.unique,
  file = 'block_diagram_lu_table.csv',
  row.names = FALSE
)

# metadata table, indexed by file name
write.csv(
  x[, c('file', 'survey_area', 'state', 'issue_date', 'caption')],
  file = 'block_diagram_metadata.csv',
  row.names = FALSE
)