-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathcreateGrams.R
37 lines (32 loc) · 1.1 KB
/
createGrams.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# start <- proc.time()
#
# ## Java stuff
# Sys.setenv(JAVA_HOME="")
# options(java.parameters="-Xmx14g")
#
# ## load RWeka, tm libs
# library(RWeka)
# library(tm)
#
# ## create a tokenizer for 2-grams only (min = 2, max = 2)
# tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
# en_texts <- VCorpus(DirSource(directory="2_train/", encoding="UTF-8"))
#
# ## create the TermDocumentMatrix
# tdm <- TermDocumentMatrix(en_texts, control=list(tokenizer=tokenizer))
#
# ## show the running time
# proc.time() - start
## Special treatment for 1-grams: the RWeka tokenizer above has a problem here, it does not return 1-letter and 2-letter words.
## Count the 1-grams (single words) found in a text file.
##
## Arguments:
##   fileInput  path of the file to read; handed to file() as-is.
##
## Returns:
##   A one-dimensional table (dimension label "allWords") whose names are the
##   distinct tokens and whose values are the number of times each one occurs.
##
## Notes:
##   * Lines are split on a single space character only. Consecutive or
##     leading spaces therefore produce empty-string tokens, which are
##     counted like any other token.
##   * Embedded NUL bytes are skipped while reading (skipNul = TRUE).
create1grams <- function(fileInput) {
  ## Open the file in binary mode, declared as UTF-8.
  input <- file(fileInput, "rb", encoding = "UTF-8")
  ## Close the connection however the function exits; without this an error
  ## raised by readLines() or strsplit() would leave the connection open.
  on.exit(close(input), add = TRUE)

  ## Get the lines from the file.
  allLines <- readLines(input, skipNul = TRUE, encoding = "UTF-8")

  ## One flat character vector holding every space-separated token.
  ## The variable name is kept because table() uses it as the dimension label.
  allWords <- unlist(strsplit(allLines, split = " "))

  table(allWords)
}
##Rprof(filename = "out.txt", interval = 0.005)
##v <- create1grams("test.txt")
##Rprof(NULL)