Skip to content

Commit

Permalink
Motif to ohe
Browse files Browse the repository at this point in the history
Added motif based encoding to ohe
fixed colnames() for property and ohe-based matrix
updated testthats
  • Loading branch information
ncborcherding committed May 2, 2024
1 parent 6cb4bd1 commit 4799887
Show file tree
Hide file tree
Showing 18 changed files with 57 additions and 42 deletions.
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Generated by roxygen2: do not edit by hand

S3method(mutate,sequences)
export(generate.sequences)
export(getIR)
export(mutate.sequences)
export(one.hot.encoder)
export(positional.encoder)
export(property.encoder)
Expand All @@ -12,4 +12,5 @@ importFrom(keras,array_reshape)
importFrom(methods,slot)
importFrom(stats,setNames)
importFrom(stringi,stri_rand_strings)
importFrom(stringr,str_sort)
importFrom(stringr,str_split)
44 changes: 24 additions & 20 deletions R/one.hot.encoder.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ one.hot.encoder <- function(input.sequences,
split.length = 1,
convert.to.matrix = TRUE,
sequence.dictionary = amino.acids[1:20]) {
if(split.length == 1) {
char_set <- c(sequence.dictionary, ".")
} else {
all_motifs <- expand.grid(replicate(split.length, sequence.dictionary, simplify = FALSE))
unique_motifs <- unique(apply(all_motifs, 1, paste, collapse = ""))
char_set <- c(unique_motifs, ".")

char_set <- c(sequence.dictionary, ".")

if (split.length > 1) {
all_motifs <- expand.grid(replicate(split.length, char_set, simplify = FALSE))
char_set <- unique(apply(all_motifs, 1, paste, collapse = ""))
}
# Create a mapping of amino acids to integers
char_to_int <- setNames(seq_along(char_set), char_set)
Expand All @@ -55,37 +55,41 @@ one.hot.encoder <- function(input.sequences,
print("One Hot Encoding sequences...")
onehot_sequences <- .convert.one.hot(unlist(padded_sequences),
max.length = max.length,
split.length = split.length,
char_set = char_set)

if(convert.to.matrix) {
print("Preparing a matrix...")
onehot_matrix <- array_reshape(onehot_sequences, c(dim(onehot_sequences)[1], dim(onehot_sequences)[2]*dim(onehot_sequences)[3]))
colnames(onehot_matrix) <- array.dimnamer(onehot_sequences)
return(onehot_matrix)
} else {
return(onehot_sequences)
}
}

#TODO Allow for motif or single AA encoding



.convert.one.hot <- function(sequences,
split.length = 1,
max.length,
char_set = NULL) {

.convert.one.hot <- function(sequences, split.length = 1, max.length, char_set = NULL) {
# Initialize the one-hot array with appropriate dimensions
one_hot_array <- array(0, dim = c(length(sequences), max.length, length(char_set)))
for (i in seq_len(length(sequences))) {
chars <- strsplit(sequences[i], "")[[1]]
valid_indices <- match(chars, char_set, nomatch = length(char_set) + 1) #NoMatch will not be recorded

# Extract all subsequences from each sequence
subsequences <- substring.extractor(sequences, split.length)

# Apply one-hot encoding
for (i in seq_along(subsequences)) {
chars <- subsequences[[i]]
valid_indices <- match(chars, char_set)
for(t in seq_along(chars)) {
one_hot_array[i, t, valid_indices[t]] <- 1
if (!is.na(valid_indices[t])) {
one_hot_array[i, t, valid_indices[t]] <- 1
}
}
}

dimnames(one_hot_array) <- list(paste0("Seq_", 1:length(sequences)),
paste0("Pos_", 1:max.length),
c(char_set))
dimnames(one_hot_array) <- list(paste0("Seq.", 1:length(sequences)),
paste0("Pos.", 1:max.length),
char_set)
return(one_hot_array)
}
11 changes: 4 additions & 7 deletions R/property.encoder.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,9 @@ property.encoder <- function(input.sequences,
stop(paste0("Please select one of the following for method.to.use: ", paste(sort(names(apex_AA_data)), collapse = ", ")))
}
vectors <- apex_AA_data[[method.to.use]]
#TODO Think about other normalization
vectors <- lapply(vectors, .min.max.normalize)

#TODO Add Apex AA list
#TODO Extract Vectors
#TODO Normalize Vectors


if(is.null(max.length)) {
max.length <- max(nchar(input.sequences))
}
Expand All @@ -62,6 +58,7 @@ property.encoder <- function(input.sequences,
if(convert.to.matrix) {
print("Preparing a matrix...")
property_matrix <- array_reshape(property_sequences, c(dim(property_sequences)[1], dim(property_sequences)[2]*dim(property_sequences)[3]))
colnames(property_matrix) <- array.dimnamer(property_sequences)
return(property_matrix)
} else {
return(property_sequences)
Expand All @@ -82,8 +79,8 @@ property.encoder <- function(input.sequences,
property_array[i,,] <- transformed
}

dimnames(property_array) <- list(paste0("Seq_", 1:length(sequences)),
paste0("Pos_", 1:max.length),
dimnames(property_array) <- list(paste0("Seq.", 1:length(sequences)),
paste0("Pos.", 1:max.length),
names(vectors))
return(property_array)
}
18 changes: 11 additions & 7 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,10 @@ amino.acids <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M"

}

#Split Strings by motif.length
substring.extractor <- function(strings, motif.length) {
lapply(strings, function(x) {
# Determine the length of the current string
string_length <- nchar(x)

# Calculate the number of substrings possible
num_substrings <- string_length - motif.length + 1

if (num_substrings > 0) {
# Generate all substrings of the specified length
substrings <- sapply(1:num_substrings, function(j) {
Expand All @@ -52,11 +47,20 @@ substring.extractor <- function(strings, motif.length) {
# Return NA if the string is too short
substrings <- NA
}
}) -> motif.list
return(motif.list)
substrings
})
}

.min.max.normalize <- function(x) {
  # Rescale a numeric vector linearly so its smallest non-missing value maps
  # to 0 and its largest to 1.
  #
  # x: a numeric vector.
  # Returns a numeric vector the same length as x. Missing values stay
  #   missing but are ignored when finding the minimum and maximum. Empty and
  #   all-missing inputs are returned unchanged.
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    return(x)
  }

  lower <- min(observed)
  span <- max(observed) - lower

  # A constant vector has zero span; dividing by it would produce NaN for
  # every element, so map such input to 0 instead.
  if (isTRUE(span == 0)) {
    return(x - lower)
  }

  (x - lower) / span
}

#' @importFrom stringr str_sort
array.dimnamer <- function(array) {
  # Build one "<position>_<feature>" label for every pairing of the array's
  # second and third dimnames, for use as column names once the array has
  # been flattened to a matrix.
  #
  # array: an array with at least three dimensions; the 2nd and 3rd dimnames
  #   supply the position and feature labels.
  # Returns a character vector with one label per (position, feature) pair,
  #   features varying fastest within each position. Returns NULL when either
  #   set of dimnames is missing, so `colnames(m) <- array.dimnamer(a)` then
  #   leaves the matrix unnamed.
  #
  # NOTE(review): the feature-fastest order is assumed to match the column
  # layout produced by the encoders' array_reshape() calls - confirm.
  #
  # Labels follow the array's own dimension order. Sorting the position
  # labels instead would attach names to the wrong columns whenever the
  # dimnames are not already in natural sort order.
  dim_labels <- dimnames(array)
  positions <- dim_labels[[2]]
  features <- dim_labels[[3]]

  # paste0() would turn zero-length input into the single string "_".
  if (length(positions) == 0L || length(features) == 0L) {
    return(NULL)
  }

  paste0(
    rep(positions, each = length(features)),
    "_",
    rep(features, times = length(positions))
  )
}

2 changes: 1 addition & 1 deletion man/mutate.sequences.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/property.encoder.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions tests/testthat/test-one.hot.encoder.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@ test_that("one.hot.encoder works", {
getdata("ohe.encoder", "one.hot.encoder_matrix")
)

ohe.2mer <- one.hot.encoder(sequences,
split.length = 2)
expect_equal(
ohe.2mer,
getdata("ohe.encoder", "one.hot.encoder_2mer.matrix")
)

ohe.padded <- one.hot.encoder(sequences,
max.length = 40)

Expand Down Expand Up @@ -47,5 +54,3 @@ test_that("one.hot.encoder works", {
)

})

#TODO Add motif testing
6 changes: 5 additions & 1 deletion tests/testthat/test-property.encoder.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
test_that("property.encoder works", {

sequences <- getdata("generate.sequences", "generate.sequences_T1")


#Return Matrix
af.matrix <- property.encoder(sequences,
method.to.use = "atchleyFactors",
convert.to.matrix = TRUE)
Expand All @@ -13,6 +14,7 @@ test_that("property.encoder works", {
getdata("property.encoder", "property.encoder_AtchleyFactors_matrix")
)

#Return Array
kf.array <- property.encoder(sequences,
method.to.use = "kideraFactors",
convert.to.matrix = FALSE)
Expand All @@ -22,6 +24,7 @@ test_that("property.encoder works", {
getdata("property.encoder", "property.encoder_KideraFactors_array")
)

#Padded Matrix
fasgai.matrix <- property.encoder(sequences,
max.length = 40,
method.to.use = "FASGAI",
Expand All @@ -32,6 +35,7 @@ test_that("property.encoder works", {
getdata("property.encoder", "property.encoder_FASGAI_matrix")
)

#Padded Array
vhse.array <- property.encoder(sequences,
max.length = 40,
method.to.use = "VHSE",
Expand Down
Binary file not shown.
Binary file modified tests/testthat/testdata/ohe.encoder/one.hot.encoder_array.rds
Binary file not shown.
Binary file modified tests/testthat/testdata/ohe.encoder/one.hot.encoder_matrix.rds
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 4799887

Please sign in to comment.