-
Notifications
You must be signed in to change notification settings - Fork 0
/
jRLE.cpp
410 lines (365 loc) · 14.8 KB
/
jRLE.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
/**
* Simple run-length encoder, written for an employment test.
*
* == Terminology ==
* - In this program, the term "token" refers to a description of
* a string containing one or more of a single character.
* Note: While unencoded and encoded tokens are generally distinguished,
* it is reasonable to encode an already encoded string.
* However, a string must be encoded at least once before
* it can be decoded.
* + An example of an unencoded token ("dToken"): aaa
* + An example of an encoded token ("eToken"): 3a
*
* - "Long sequence" refers to a string containing 10 or
* more of a single character.
* + An example of an unencoded long sequence: aaaaaaaaaa
* + An example of an encoded long sequence: #10a
*
* - "#-case a" refers to the special case where a token is a long sequence
* - "#-case b" where a token consists of '#' chars
* - "#-case-c" where a token consists of a number
* This case is an issue, as the token must be separated from the
* next, lest the token be confused with the next one's char count
*
* Invoke from the command line with your file path as the first argument.
* File must have the extension ".txt" and use ASCII encoding.
*
* Written by Jasper Law 2020
* https://github.com/Trimatix/cpp-run-length-encoder
*/
#include "jRLE.h"
/**
* Read the contents of the requested text file, and return it as a string
*
* @param fname path to the desired text file
* @return std::string containing the contents of the file
* @throw std::ios_base::failure if the file could not be opened
*/
std::string readFile(std::string fname) {
// Validate provided file extension
if (fname.length() < 4 ||
fname.substr(fname.length() - 4, 4) != ".txt") {
std::cerr << "Invalid file path '" + fname
+ "' - path must end with the file extension '.txt'";
throw std::invalid_argument("Invalid file path '" + fname
+ "' - path must end with the file extension '.txt'");
}
// Initialize file reader
std::ifstream freader{ fname };
// Handle invalid file paths
if (!freader) {
std::cerr << "Error opening file '" + fname
+ "' - Ensure path is correct and file is not in use.\n";
throw std::ios_base::failure("Error opening file '" + fname
+ "' - Ensure path is correct and file is not in use.");
}
// Read file contents
std::string fcontents;
std::string line;
while(std::getline(freader, line)){
fcontents += line + '\n';
}
freader.close();
return fcontents;
}
/**
* Write the passed string to the requested text file
*
* @param fname path to the desired text file
* @return std::string containing the desired contents of the file
* @throw std::ios_base::failure if the file could not be opened
*/
void writeFile(std::string fname, std::string inText) {
// Validate provided file extension
if (fname.length() < 4 ||
fname.substr(fname.length() - 4, 4) != ".txt") {
std::cerr << "Invalid file path '" + fname
+ "' - path must end with the file extension '.txt'";
throw std::invalid_argument("Invalid file path '" + fname
+ "' - path must end with the file extension '.txt'");
}
// Initialize file reader
std::ofstream fwriter{ fname };
// Handle invalid file paths
if (!fwriter) {
std::cerr << "Error opening file '" + fname
+ "' - Ensure path is correct and file is not in use.\n";
throw std::ios_base::failure("Error opening file '" + fname
+ "' - Ensure path is correct and file is not in use.");
}
// Write file contents
fwriter << inText;
fwriter.close();
}
/**
* Split an unencoded string into a vector of unencoded tokens
*
* @param inText std::string containing the unencoded text to tokenize
* @return std::vector of std::strings representing dTokens
*/
std::vector<std::string> tokenizeUnencoded(std::string inText) {
// Special case: empty string
if (inText.empty()) {
return {};
}
// Starting index of the current token in the string
int currentSeqStart{0};
// The finished vector of unencoded tokens
std::vector<std::string> dTokens{};
// Loop over all characters in the string, start with the 1st
for (int i{0}; i < inText.length(); i++) {
// If the current char is different to the previous, or if X is the
// last char in the string.
if (i == inText.length() - 1 || inText.at(i) != inText.at(i+1)) {
// Add a new token containing the chars in inText, from
// currentSeqStart to the previous char
dTokens.push_back(inText.substr(currentSeqStart,
i - currentSeqStart + 1));
// Start a new sequence from the current char
currentSeqStart = i+1;
}
}
return dTokens;
}
/**
* Split a run-length encoded string into a vector of encoded tokens
*
* @param inText std::string containing the encoded text to tokenize
* @return std::vector of std::strings representing eTokens
* @throw std::invalid_argument if a token is not accompanied by its char count
*/
std::vector<std::string> tokenizeEncoded(std::string inText) {
// Special case: empty string
if (inText.empty()) {
return {};
}
// The finished vector of eTokens
std::vector<std::string> eTokens{};
// The starting index of the current long sequence
// -1 indicates the current token is not a long sequence
int longSeqStart {-1};
// Temporary variable containing the token currently being read.
// Not always used.
std::string newToken;
// Iterate over all characters in inText
for (int i {0}; i < inText.length(); i++) {
// Temporary variable containing the currently read character
char currentChar = inText.at(i);
// If the current token is a long sequence
if (longSeqStart != -1) {
if (!isdigit(currentChar)) {
if (currentChar == '#') {
// If the next character is #
if (inText.at(i+1) == '#') {
// #-case b
// Add a new token with the chars in inText,
// from longSeqStart to char after the current
newToken = inText.substr(longSeqStart,
i - longSeqStart + 2);
// In case #-case a is also present,
// start a new long sequence
longSeqStart = i + 2;
// Skip the iteration that would handle the extra #
i++;
// If the current char is not a number or a #
} else {
// #-case a and/or #-case c
// Add a new token with the chars from longSeqStart to
// the previous char
newToken = inText.substr(longSeqStart,
i - longSeqStart);
// In case #-case a is also present in the next token,
// start a new long sequence
longSeqStart = i+1;
}
// If the current character is not a number or a #
} else {
// #-case a
// Add a new token with the chars from longSeqStart
// to the current char
newToken = inText.substr(longSeqStart,
i - longSeqStart + 1);
// No # detected, so disable long sequence tracker
longSeqStart = -1;
}
// Unless current char is EOF with no char count,
// push the new token to the vector
if (newToken != "\n") {
eTokens.push_back(newToken);
}
}
// If the current token is not a long sequence
} else {
// Start a new long sequence if one is marked
if (currentChar == '#') {
longSeqStart = i + 1;
} else if (!isdigit(inText.at(i))) {
// If a char exists that is not accompanied by a char count
// (except LFCR on EOF), the encoding is invalid. Throw error.
if (inText.at(i) != '\n' && i < inText.length() - 1) {
throw std::invalid_argument(
"Invalid encoded sequence. Non-digit char at position "
+ std::to_string(i));
}
// If current char is a digit, this is a single-char token.
} else if (inText.substr(i, 2) != "\n") {
// Add a new token containing the count and the char.
eTokens.push_back(inText.substr(i, 2));
// Skip the token-defining char.
i++;
}
}
}
return eTokens;
}
/**
* Concatenate the items in a vector of strings into a single string.
*
* @param inVect the std::vector of std::strings to concatenate
* @return All items of inVect, in order, in a single std::string
*/
std::string vectorConcatenate(std::vector<std::string> inVect) {
std::string outStr{};
for (std::string currentToken: inVect) {
outStr += currentToken;
}
return outStr;
}
/**
* Individually run-length encode each item in a vector of tokens
* Each token must only contain one or more of a single character
*
* @param dTokens vector of ASCII-encoded std::strings to run-length encode
* @return vector of encoded std::strings
*/
std::vector<std::string> encodeTokens(std::vector<std::string> dTokens) {
// Special case: empty vector
if (dTokens.empty()) {
return {};
}
// The finished vector of eTokens
std::vector<std::string> eTokens{};
// Temporary variables for the current processing and read tokens
std::string newToken{};
std::string currentToken;
// Iterate over all tokens in dTokens
for (int i{0}; i < dTokens.size(); i++) {
currentToken = dTokens.at(i);
newToken = "";
// #-case a
if (currentToken.length() > 9 ||
// OR #-case c (or both)
(i > 0 && isdigit(dTokens.at(i-1).front()))) {
// Preceed the token with a # symbol
newToken = "#";
}
// Add a new token by concatenating the length of the token with
// one of its characters
newToken += std::to_string(currentToken.length()) + currentToken.front();
// If the token consists of # characters, mark it with an extra #
if (currentToken.front() == '#') {
newToken += "#";
}
eTokens.push_back(newToken);
}
return eTokens;
}
/**
* Individually run-length decode each item in a vector of tokens
* Each token must contain an encoding of one or more of a single character
*
* @param eTokens vector of ASCII-encoded std::strings to run-length decode
* @return vector of decoded std::strings
*/
std::vector<std::string> decodeTokens(std::vector<std::string> eTokens) {
// Special case: empty vector
if (eTokens.empty()) {
return {};
}
// The finished vector of decoded tokens
std::vector<std::string> dTokens{};
// Iterate over all encoded tokens
for (int i {0}; i < eTokens.size(); i++) {
std::string currentToken = eTokens.at(i);
// If the token describes a single non-# character
if (currentToken.length() == 2){
// Add a new token by concatenating the back character x times,
// where x is the front character, converted to int
dTokens.push_back(std::string(currentToken.front() - '0',
currentToken.back()));
// #-case b
} else if (currentToken.back() == '#') {
// Assume #-case a as well to be safe.
// Add a new token by concatenating character x, y times,
// where x is the third character from the back (the back 2 are #)
// and y is all preceeding characters
dTokens.push_back(std::string(stoi(currentToken.substr(0,
currentToken.length() - 2)), '#'));
// #-case a
} else {
// Same as above, but using the last character as
// the token-defining character, and all preceeding for the count.
dTokens.push_back(std::string(stoi(currentToken.substr(0,
currentToken.length() - 1)), currentToken.back()));
}
}
return dTokens;
}
/**
* Run-length encode or decode the given text file, and write back the result.
*
* @param argc The number of arguments passed in argv
* @param argv A two-long array of strings, defining the arguments below:
* argv[0] is the function to perform: -e for encode, -d for decode.
* argv[1] is the path to the input file. File must have the extension .txt
* @throw std::invalid_argument If argv is not length 2
* @throw std::invalid_argument If argv[0] is neither -e nor -d.
*/
int main(int argc, char* argv[]) {
// Require both arguments
if (argc < 3) {
std::string errstring =
"Missing required argument. Correct command format: ";
errstring += "jRLE.exe <-e | -d> file-path";
std::cerr << errstring << std::endl;
throw std::invalid_argument(errstring);
}
std::string fileName = std::string(argv[2]);
std::string func = std::string(argv[1]);
std::string fileText = readFile(fileName);
std::vector<std::string> inTokens;
std::vector<std::string> outTokens;
std::string outText;
// Handle decoding
if (func == "-d") {
inTokens = tokenizeEncoded(fileText);
outTokens = decodeTokens(inTokens);
// Handle encoding
} else if (func == "-e") {
inTokens = tokenizeUnencoded(fileText);
outTokens = encodeTokens(inTokens);
// Handle invalid function arguments
} else {
std::cerr << "Invalid argument '" + func
+ "' - arg1 must be behaviour flag '-e' or '-d'\n";
throw std::invalid_argument("Invalid argument '" + func
+ "' - arg1 must be behaviour flag '-e' or '-d'");
}
// Concatenate and write the resulting vector to file
outText = vectorConcatenate(outTokens);
writeFile(fileName, outText);
// Report the compression ratio
float ratio;
if (func == "-e") {
ratio = static_cast<float>(fileText.length()) /
static_cast<float>(outText.length());
} else {
ratio = static_cast<float>(outText.length()) /
static_cast<float>(fileText.length());
}
std::cout << "Original file length: " + std::to_string(fileText.length())
+ "\nNew length: " + std::to_string(outText.length())
+ "\nCompression ratio: " + std::to_string(ratio);
return 0;
}