From c5b3b65a32a0a0917a39376e832ece2fa352c69f Mon Sep 17 00:00:00 2001 From: Victor Manuel Alvarez Date: Mon, 2 Dec 2013 23:39:15 +0100 Subject: [PATCH] Improve yr_re_execute with a non-recursive algorithm --- libyara/re.c | 511 +++++++++++++++++++++++---------------------------- 1 file changed, 232 insertions(+), 279 deletions(-) diff --git a/libyara/re.c b/libyara/re.c index 5596c0fb4d..0efeeb3e66 100644 --- a/libyara/re.c +++ b/libyara/re.c @@ -41,7 +41,6 @@ order to avoid confusion with operating system threads. #include "re.h" -#define MAX_RE_FIBERS 1024 #define MAX_RE_STACK 1024 #define RE_SCAN_LIMIT 65535 @@ -53,55 +52,30 @@ order to avoid confusion with operating system threads. #define min(x, y) ((x < y) ? (x) : (y)) #endif -// Each fiber has an associated stack, which is used by -// PUSH, POP and JNZ -typedef struct _RE_FIBER_DATA +typedef struct _RE_FIBER { - int stack_top; + uint8_t* ip; + int32_t sp; uint16_t stack[MAX_RE_STACK]; - struct _RE_FIBER_DATA* next; - struct _RE_FIBER_DATA* prev; - -} RE_FIBER_DATA; - - -// Stacks are allocated as needed, and freed stacks are kept in -// a pool for later re-use. - -typedef struct _RE_FIBER_DATA_POOL -{ - RE_FIBER_DATA* free; - RE_FIBER_DATA* used; - -} RE_FIBER_DATA_POOL; - - -// A fiber is described by its current instruction pointer and -// its fiber_data. - -typedef struct _RE_FIBER -{ - uint8_t* ip; - RE_FIBER_DATA* fiber_data; + struct _RE_FIBER* prev; + struct _RE_FIBER* next; } RE_FIBER; typedef struct _RE_FIBER_LIST { - int count; - RE_FIBER items[MAX_RE_FIBERS]; + RE_FIBER* head; + RE_FIBER* tail; } RE_FIBER_LIST; typedef struct _RE_THREAD_STORAGE { - RE_FIBER_LIST list1; - RE_FIBER_LIST list2; - RE_FIBER_DATA_POOL fiber_data_pool; + RE_FIBER_LIST fiber_pool; } RE_THREAD_STORAGE; @@ -168,8 +142,8 @@ int yr_re_finalize() int yr_re_finalize_thread() { - RE_FIBER_DATA* fiber_data; - RE_FIBER_DATA* next_fiber_data; + RE_FIBER* fiber; + RE_FIBER* next_fiber; RE_THREAD_STORAGE* storage; #ifdef WIN32 @@ -180,13 +154,13 @@ int yr_re_finalize_thread() if (storage != NULL) { - fiber_data = storage->fiber_data_pool.free; + fiber = storage->fiber_pool.head; - while (fiber_data != NULL) + while (fiber != NULL) { - next_fiber_data = fiber_data->next; - yr_free(fiber_data); - fiber_data = next_fiber_data; + next_fiber = fiber->next; + yr_free(fiber); + fiber = next_fiber; } yr_free(storage); @@ -426,12 +400,10 @@ int _yr_re_emit( int split_size; int inst_size; int jmp_size; - int jmp_offset; RE_NODE* left; RE_NODE* right; - uint16_t idx; int16_t* split_offset_addr; int16_t* jmp_offset_addr; uint8_t* instruction_addr; @@ -851,188 +823,174 @@ int yr_re_emit_code( } -RE_FIBER_DATA* _yr_re_alloc_fiber_data( - RE_FIBER_DATA_POOL* pool) +RE_FIBER* _yr_re_fiber_create( + RE_FIBER_LIST* fiber_pool) { - RE_FIBER_DATA* fiber_data; + RE_FIBER* fiber; - if (pool->free != NULL) + if (fiber_pool->head != NULL) { - fiber_data = pool->free; - pool->free = fiber_data->next; - - if (pool->free != NULL) - pool->free->prev = NULL; + fiber = fiber_pool->head; + fiber_pool->head = fiber->next; + if (fiber_pool->tail == fiber) + fiber_pool->tail = NULL; } else { - fiber_data = yr_malloc(sizeof(RE_FIBER_DATA)); + fiber = yr_malloc(sizeof(RE_FIBER)); } - fiber_data->stack_top = -1; - fiber_data->prev = NULL; - - if (pool->used != NULL) - pool->used->prev = fiber_data; - - fiber_data->next = pool->used; - pool->used = fiber_data; + if (fiber != NULL) + { + fiber->ip = NULL; + fiber->sp = -1; + fiber->next = NULL; + fiber->prev = NULL; + } - return fiber_data; + return fiber; } - -RE_FIBER_DATA* _yr_re_clone_fiber_data( - RE_FIBER_DATA* fiber_data, - RE_FIBER_DATA_POOL* pool) +void _yr_re_fiber_append( + RE_FIBER* fiber, + RE_FIBER_LIST* fiber_list) { - RE_FIBER_DATA* clon; - int i; + fiber->prev = fiber_list->tail; - if (fiber_data == NULL) - return NULL; + if (fiber_list->tail != NULL) + fiber_list->tail->next = fiber; - clon = _yr_re_alloc_fiber_data(pool); - clon->stack_top = fiber_data->stack_top; + fiber_list->tail = fiber; - for (i = 0; i < clon->stack_top; i++) - clon->stack[i] = fiber_data->stack[i]; + if (fiber_list->head == NULL) + fiber_list->head = fiber; - return clon; + assert(fiber_list->tail->next == NULL); + assert(fiber_list->head->prev == NULL); } -void _yr_re_free_fiber_data( - RE_FIBER_DATA* fiber_data, - RE_FIBER_DATA_POOL* pool) +RE_FIBER* _yr_re_fiber_split( + RE_FIBER* fiber, + RE_FIBER_LIST* fiber_list, + RE_FIBER_LIST* fiber_pool) { - if (fiber_data == NULL) - return; + RE_FIBER* new_fiber; + int32_t i; - if (pool->used == fiber_data) - pool->used = fiber_data->next; + new_fiber = _yr_re_fiber_create(fiber_pool); - if (fiber_data->prev != NULL) - fiber_data->prev->next = fiber_data->next; + if (new_fiber == NULL) + return NULL; - if (fiber_data->next != NULL) - fiber_data->next->prev = fiber_data->prev; + new_fiber->sp = fiber->sp; + new_fiber->ip = fiber->ip; - fiber_data->next = pool->free; + for (i = 0; i < fiber->sp; i++) + new_fiber->stack[i] = fiber->stack[i]; - if (pool->free != NULL) - pool->free->prev = fiber_data; + new_fiber->next = fiber->next; + new_fiber->prev = fiber; - pool->free = fiber_data; - fiber_data->prev = NULL; -} + if (fiber->next != NULL) + fiber->next->prev = new_fiber; + fiber->next = new_fiber; -int _yr_re_fiber_exists( - RE_FIBER_LIST* fibers, - uint8_t* ip) -{ - int i; + if (fiber_list->tail == fiber) + fiber_list->tail = new_fiber; - for (i = 0; i < fibers->count; i++) - if (fibers->items[i].ip == ip) - return TRUE; + assert(fiber_list->tail->next == NULL); + assert(fiber_list->head->prev == NULL); - return FALSE; + return new_fiber; } -void _yr_re_add_fiber( - RE_FIBER_LIST* fibers, - RE_THREAD_STORAGE* storage, - uint8_t* ip, - uint8_t* input, - RE_FIBER_DATA* fiber_data) +// +// _yr_re_fiber_kill +// +// Kills a given fiber by removing it from the fiber list and putting it +// in the fiber pool. +// + +RE_FIBER* _yr_re_fiber_kill( + RE_FIBER* fiber, + RE_FIBER_LIST* fiber_list, + RE_FIBER_LIST* fiber_pool) { - RE_FIBER_DATA* new_fiber_data; + RE_FIBER* next_fiber = fiber->next; - uint16_t counter_index; - int16_t jmp_offset; + if (fiber->prev != NULL) + fiber->prev->next = next_fiber; - if (_yr_re_fiber_exists(fibers, ip)) - { - _yr_re_free_fiber_data(fiber_data, &storage->fiber_data_pool); - return; - } + if (next_fiber != NULL) + next_fiber->prev = fiber->prev; - switch(*ip) - { - case RE_OPCODE_JUMP: - jmp_offset = *(int16_t*)(ip + 1); - _yr_re_add_fiber(fibers, storage, ip + jmp_offset, input, fiber_data); - break; + if (fiber_pool->tail != NULL) + fiber_pool->tail->next = fiber; - case RE_OPCODE_JNZ: - jmp_offset = *(int16_t*)(ip + 1); - fiber_data->stack[fiber_data->stack_top]--; + if (fiber_list->tail == fiber) + fiber_list->tail = fiber->prev; - if (fiber_data->stack[fiber_data->stack_top] > 0) - _yr_re_add_fiber(fibers, storage, ip + jmp_offset, input, fiber_data); - else - _yr_re_add_fiber(fibers, storage, ip + 3, input, fiber_data); - break; + if (fiber_list->head == fiber) + fiber_list->head = next_fiber; - case RE_OPCODE_PUSH: - if (fiber_data == NULL) - fiber_data = _yr_re_alloc_fiber_data(&storage->fiber_data_pool); + fiber->next = NULL; + fiber->prev = fiber_pool->tail; + fiber_pool->tail = fiber; - fiber_data->stack[++fiber_data->stack_top] = *(uint16_t*)(ip + 1); - _yr_re_add_fiber(fibers, storage, ip + 3, input, fiber_data); - break; + if (fiber_pool->head == NULL) + fiber_pool->head = fiber; - case RE_OPCODE_POP: - fiber_data->stack_top--; - if (fiber_data->stack_top == -1) - { - _yr_re_free_fiber_data(fiber_data, &storage->fiber_data_pool); - fiber_data = NULL; - } - _yr_re_add_fiber(fibers, storage, ip + 1, input, fiber_data); - break; + return next_fiber; +} - case RE_OPCODE_SPLIT_A: - jmp_offset = *(int16_t*)(ip + 1); - new_fiber_data = _yr_re_clone_fiber_data( - fiber_data, &storage->fiber_data_pool); +// +// _yr_re_fiber_kill_tail +// +// Kills all fibers from the given one up to the end of the fiber list. +// - _yr_re_add_fiber( - fibers, storage, ip + 3, input, fiber_data); - _yr_re_add_fiber( - fibers, storage, ip + jmp_offset, input, new_fiber_data); - break; +void _yr_re_fiber_kill_tail( + RE_FIBER* fiber, + RE_FIBER_LIST* fiber_list, + RE_FIBER_LIST* fiber_pool) +{ + RE_FIBER* prev_fiber = fiber->prev; - case RE_OPCODE_SPLIT_B: - jmp_offset = *(int16_t*)(ip + 1); + if (prev_fiber != NULL) + prev_fiber->next = NULL; - new_fiber_data = _yr_re_clone_fiber_data( - fiber_data, &storage->fiber_data_pool); + fiber->prev = fiber_pool->tail; - _yr_re_add_fiber(fibers, storage, ip + jmp_offset, input, fiber_data); - _yr_re_add_fiber(fibers, storage, ip + 3, input, new_fiber_data); - break; + if (fiber_pool->tail != NULL) + fiber_pool->tail->next = fiber; - default: - assert(fibers->count < MAX_RE_FIBERS); - fibers->items[fibers->count].ip = ip; - fibers->items[fibers->count].fiber_data = fiber_data; - fibers->count++; - } + fiber_pool->tail = fiber_list->tail; + fiber_list->tail = prev_fiber; + + if (fiber_list->head == fiber) + fiber_list->head = NULL; + + if (fiber_pool->head == NULL) + fiber_pool->head = fiber; } -#define swap_fibers(x, y) \ - { \ - RE_FIBER_LIST* tmp; \ - tmp = x; \ - x = y; \ - y = tmp; \ - } +#define prolog \ + if (count >= max_count) \ + { \ + fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool); \ + break; \ + } \ + +#define epilog \ + if (match) \ + fiber = fiber->next; \ + else\ + fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool) \ // // yr_re_exec @@ -1060,21 +1018,19 @@ int yr_re_exec( RE_MATCH_CALLBACK_FUNC callback, void* callback_args) { - size_t i; uint8_t* ip; uint8_t* input; uint8_t mask; uint8_t value; + RE_FIBER_LIST fibers; RE_THREAD_STORAGE* storage; - RE_FIBER_LIST* fibers; - RE_FIBER_LIST* next_fibers; - RE_FIBER_DATA* fiber_data; + RE_FIBER* fiber; + RE_FIBER* new_fiber; - int fiber_idx; - int j; + int count; + int max_count; int match; - char character; int character_size; int result = -1; @@ -1091,8 +1047,8 @@ int yr_re_exec( if (storage == NULL) return ERROR_INSUFICIENT_MEMORY; - storage->fiber_data_pool.free = NULL; - storage->fiber_data_pool.used = NULL; + storage->fiber_pool.head = NULL; + storage->fiber_pool.tail = NULL; #ifdef WIN32 TlsSetValue(thread_storage_key, storage); @@ -1101,49 +1057,50 @@ int yr_re_exec( #endif } - input = input_data; - fibers = &storage->list1; - next_fibers = &storage->list2; - if (flags & RE_FLAGS_WIDE) character_size = 2; else character_size = 1; - fibers->count = 0; - next_fibers->count = 0; + fiber = _yr_re_fiber_create(&storage->fiber_pool); + fiber->ip = code; - // Create the initial execution fiber starting at the provided the beginning - // of the provided code. The fiber data is initially NULL and will be created - // dynamically when the first PUSH instruction is found. + fibers.head = fiber; + fibers.tail = fiber; - _yr_re_add_fiber(fibers, storage, code, input, NULL); + input = input_data; + count = 0; + max_count = min(input_size, RE_SCAN_LIMIT); - for (i = 0; i < min(input_size, RE_SCAN_LIMIT); i += character_size) + while (fibers.head != NULL) { - if ((flags & RE_FLAGS_SCAN) && - !(flags & RE_FLAGS_START_ANCHORED)) - _yr_re_add_fiber(fibers, storage, code, input, NULL); - - if (fibers->count == 0) - break; + fiber = fibers.head; - for(fiber_idx = 0; fiber_idx < fibers->count; fiber_idx++) + while(fiber != NULL) { - ip = fibers->items[fiber_idx].ip; - fiber_data = fibers->items[fiber_idx].fiber_data; + ip = fiber->ip; switch(*ip) { case RE_OPCODE_LITERAL: + prolog; if (flags & RE_FLAGS_NO_CASE) match = lowercase[*input] == lowercase[*(ip + 1)]; else match = (*input == *(ip + 1)); - ip += 2; + fiber->ip += 2; + epilog; + break; + + case RE_OPCODE_ANY: + prolog; + match = (*input != 0x0A || flags & RE_FLAGS_DOT_ALL); + fiber->ip += 1; + epilog; break; case RE_OPCODE_MASKED_LITERAL: + prolog; value = *(int16_t*)(ip + 1) & 0xFF; mask = *(int16_t*)(ip + 1) >> 8; @@ -1152,76 +1109,120 @@ int yr_re_exec( // which can't be case-insensitive. match = ((*input & mask) == value); - ip += 3; + fiber->ip += 3; + epilog; break; case RE_OPCODE_CLASS: + prolog; if (flags & RE_FLAGS_NO_CASE) match = CHAR_IN_CLASS(*input, ip + 1) || CHAR_IN_CLASS(altercase[*input], ip + 1); else match = CHAR_IN_CLASS(*input, ip + 1); - ip += 33; + fiber->ip += 33; + epilog; break; case RE_OPCODE_WORD_CHAR: + prolog; match = (isalnum(*input) || *input == '_'); - ip += 1; + fiber->ip += 1; + epilog; break; case RE_OPCODE_NON_WORD_CHAR: + prolog; match = (!isalnum(*input) && *input != '_'); - ip += 1; + fiber->ip += 1; + epilog; break; case RE_OPCODE_SPACE: + prolog; match = (*input == ' ' || *input == '\t'); - ip += 1; + fiber->ip += 1; + epilog; break; case RE_OPCODE_NON_SPACE: + prolog; match = (*input != ' ' && *input != '\t'); - ip += 1; + fiber->ip += 1; + epilog; break; case RE_OPCODE_DIGIT: + prolog; match = isdigit(*input); - ip += 1; + fiber->ip += 1; + epilog; break; case RE_OPCODE_NON_DIGIT: + prolog; match = !isdigit(*input); - ip += 1; + fiber->ip += 1; + epilog; break; - case RE_OPCODE_ANY: - match = (*input != 0x0A || flags & RE_FLAGS_DOT_ALL); - ip += 1; + case RE_OPCODE_SPLIT_A: + new_fiber = _yr_re_fiber_split(fiber, &fibers, &storage->fiber_pool); + new_fiber->ip += *(int16_t*)(ip + 1); + fiber->ip += 3; break; - case RE_OPCODE_MATCH: + case RE_OPCODE_SPLIT_B: + new_fiber = _yr_re_fiber_split(fiber, &fibers, &storage->fiber_pool); + new_fiber->ip += 3; + fiber->ip += *(int16_t*)(ip + 1); + break; + + case RE_OPCODE_JUMP: + fiber->ip = ip + *(int16_t*)(ip + 1); + break; - match = FALSE; + case RE_OPCODE_JNZ: + fiber->stack[fiber->sp]--; + if (fiber->stack[fiber->sp] > 0) + fiber->ip = ip + *(int16_t*)(ip + 1); + else + fiber->ip += 3; + break; - if (flags & RE_FLAGS_END_ANCHORED && i < input_size) + case RE_OPCODE_PUSH: + fiber->stack[++fiber->sp] = *(uint16_t*)(ip + 1); + fiber->ip += 3; + break; + + case RE_OPCODE_POP: + fiber->sp--; + fiber->ip++; + break; + + case RE_OPCODE_MATCH: + + if (flags & RE_FLAGS_END_ANCHORED && count < input_size) + { + fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool); break; + } - result = i; + result = count; if (flags & RE_FLAGS_EXHAUSTIVE) { if (flags & RE_FLAGS_BACKWARDS) - callback(input + character_size, i, flags, callback_args); + callback(input + character_size, count, flags, callback_args); else - callback(input_data, i, flags, callback_args); + callback(input_data, count, flags, callback_args); + + fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool); } else { - // As we are forcing a jump out of the loop fiber_idx - // won't be incremented. Let's do it before exiting. - - fiber_idx++; - goto _exit_loop; + _yr_re_fiber_kill_tail(fiber, &fibers, &storage->fiber_pool); + fiber = NULL; } break; @@ -1229,33 +1230,8 @@ int yr_re_exec( default: assert(FALSE); } - - if (match) - _yr_re_add_fiber( - next_fibers, - storage, - ip, - input + character_size, - fiber_data); - else - _yr_re_free_fiber_data( - fiber_data, - &storage->fiber_data_pool); } - _exit_loop: - - // Free the fiber data for any remaining fiber that didn't - // survived for the next step. - - for(; fiber_idx < fibers->count; fiber_idx++) - _yr_re_free_fiber_data( - fibers->items[fiber_idx].fiber_data, - &storage->fiber_data_pool); - - swap_fibers(fibers, next_fibers); - next_fibers->count = 0; - if (flags & RE_FLAGS_WIDE && *(input + 1) != 0) break; @@ -1264,40 +1240,17 @@ int yr_re_exec( else input += character_size; - } //for (i = 0; i < min(input_size, RE_SCAN_LIMIT) ... + count++; - if (!(flags & RE_FLAGS_END_ANCHORED) || i == input_size) - { - for(fiber_idx = 0; fiber_idx < fibers->count; fiber_idx++) + if ((flags & RE_FLAGS_SCAN) && !(flags & RE_FLAGS_START_ANCHORED) && + count < max_count) { - if (*fibers->items[fiber_idx].ip != RE_OPCODE_MATCH) - continue; - - if (flags & RE_FLAGS_EXHAUSTIVE) - { - if (flags & RE_FLAGS_BACKWARDS) - callback( - input + character_size, i, flags, callback_args); - else - callback( - input_data, i, flags, callback_args); - } - else - { - result = i; - break; - } + fiber = _yr_re_fiber_create(&storage->fiber_pool); + fiber->ip = code; + _yr_re_fiber_append(fiber, &fibers); } } - for(fiber_idx = 0; fiber_idx < fibers->count; fiber_idx++) - _yr_re_free_fiber_data( - fibers->items[fiber_idx].fiber_data, - &storage->fiber_data_pool); - - // Ensure that every fiber data was released - assert(storage->fiber_data_pool.used == NULL); - return result; }