-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathRegular_expressions.R
103 lines (73 loc) · 3.34 KB
/
Regular_expressions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
###### PART1: String functions related to regular expression
txt <- c("The", "R", "Foundation", "for Statistical Computing");
# Type1: identify match to a pattern
grep("Found", txt, value=F);
grepl("found", txt, ignore.case = T);
# Type2: extract match to a pattern
grep("Found", txt, value=T);
# Type3: replace a pattern
gsub("a", "E", "abc abc")
sub("a", "E", "abc abc")
# Type4: split a string using a patter
strsplit(x="abcd",split="b");
###### PART2: Regular expression syntax
#1. Escape sequences
# \': single quote. You don’t need to escape single quote inside a double-quoted string, so we can also use "'" in the previous example.
# \": double quote. Similarly, double quotes can be used inside a single-quoted string, i.e. '"'.
# \n: newline.
# \t: tab character
cat("\"a\nb\tc\'")
#2 Quantifiers
# *: matches at least 0 times.
# +: matches at least 1 times.
# ?: matches at most 1 times.
# {n}: matches exactly n times.
# {n,}: matches at least n times.
# {n,m}: matches between n and m times.
strings <- c("a", "ab", "acb", "accb", "acccb", "accccb")
grep("ac*b", strings, value = TRUE)
grep("ac+b", strings, value = TRUE)
grep("ac?b", strings, value = TRUE)
grep("ac{2}b", strings, value = TRUE)
grep("ac{2,}b", strings, value = TRUE)
grep("ac{2,3}b", strings, value = TRUE)
# 3. Position of pattern within string
strings <- c("abcd", "cdab", "cabd");
grep("^ab", strings, value = TRUE);
grep("ab$", strings, value = TRUE)
# 4. Operators
# .: matches any single character
# [...]: a character list, matches any one of the characters inside the square brackets
# [^...]: an inverted character list, similar to [...],
# but matches any characters except those inside the square brackets.
# |: an “or” operator, matches patterns on either side of the |.
# (...): grouping in regular expressions. This allows you to retrieve the bits that matched various parts of your regular expression so you can alter them or use them for building up a new string. Each group can than be refer using \\N, with N being the No. of (...) used. This is called
strings <- c("^ab", "ab", "abc", "abd", "abe");
grep("ab.", strings, value = TRUE)
grep("ab[c-e]", strings, value = TRUE)
grep("ab[^c]", strings, value = TRUE)
grep("^ab", strings, value = TRUE)
grep("abc|abd", strings, value = TRUE)
gsub("(ab)", "\\1\\1", strings)
# PART3: Exercises
#1. find the indices of strings ending with "at"
# HINT: at
array <- c("hat", "cat", "at home", "hate");
result <- ();
#2. double all 'a' or 'b's in the following string
# add a hyphen after each 'a' or 'b's,
# ignore case
# HINT: gsub
string <- "abc ABC";
result <- "a_a_b_b_c A_A_B_B_C"
## Double all 'a' or 'b's; "\" must be escaped, i.e., 'doubled'
#3. Make up a question for your neighbor and then answer their question
#########################
# ANSWER FOR EXERCISES
#########################
# EX1
grep("at$", array, value = FALSE)
# EX2
a|b
gsub("([ab])", "\\1_\\1_", string, ignore.case=T)
# Reference http://stat545-ubc.github.io/block022_regular-expression.html#general-modes-for-patterns