From 5aa44c8fb7cd997ff14d2a1588f10e17de80e3bb Mon Sep 17 00:00:00 2001 From: scanossmining Date: Thu, 8 Feb 2024 14:25:22 +0000 Subject: [PATCH] Add path similarity logic. Update flag 2048. Remove ignore file logic on scan and update ignored extensions --- inc/component.h | 2 + inc/scanoss.h | 2 +- src/file.c | 17 ++-- src/ignored_extensions.c | 8 +- src/ignorelist.c | 7 +- src/match.c | 198 ++++++++++++++++++++++++++++----------- src/match_list.c | 2 +- src/report.c | 1 + src/scan.c | 114 ++++++++++------------ 9 files changed, 221 insertions(+), 130 deletions(-) diff --git a/inc/component.h b/inc/component.h index b5dbe3b..1f9f086 100644 --- a/inc/component.h +++ b/inc/component.h @@ -41,6 +41,8 @@ typedef struct component_data_t char * dependency_text; /* used in json output generation */ char * health_text; /* used in json output generation */ int hits; /*used in binary analysis*/ + char * file_path_ref; + int path_rank; } component_data_t; component_data_t * component_init(void); diff --git a/inc/scanoss.h b/inc/scanoss.h index 86552f1..a33d8b5 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -65,7 +65,7 @@ #define DISABLE_BEST_MATCH 256 #define DISABLE_REPORT_IDENTIFIED 512 #define ENABLE_DOWNLOAD_URL 1024 -#define ENABLE_GITHUB_FULL_PATH 2048 +#define ENABLE_PATH_HINT 2048 #define DISABLE_SERVER_INFO 4096 #define DISABLE_HEALTH 8192 #define ENABLE_HIGH_ACCURACY 16384 diff --git a/src/file.c b/src/file.c index 44fad56..eb84dd4 100644 --- a/src/file.c +++ b/src/file.c @@ -139,27 +139,32 @@ void get_file_md5(char *filepath, uint8_t *md5_result) /* Read file contents into buffer */ FILE *in = fopen(filepath, "rb"); + + if (!in) + { + MD5(NULL, 0, md5_result); + return; + } + fseek(in, 0L, SEEK_END); long filesize = ftell(in); - if (!filesize) { MD5(NULL, 0, md5_result); } - else { /* Read file contents */ fseek(in, 0L, SEEK_SET); uint8_t *buffer = malloc(filesize); - if (!fread(buffer, filesize, 1, in)) fprintf(stderr, "Warning: cannot open file %s\n", filepath); + if (!fread(buffer, filesize, 1, in)) + fprintf(stderr, "Warning: cannot open file %s\n", filepath); /* Calculate MD5sum */ MD5(buffer, filesize, md5_result); - free (buffer); + free(buffer); + fclose(in); } - - fclose(in); } /** diff --git a/src/ignored_extensions.c b/src/ignored_extensions.c index 61c0221..78624fa 100644 --- a/src/ignored_extensions.c +++ b/src/ignored_extensions.c @@ -36,9 +36,9 @@ char *IGNORED_EXTENSIONS[] = { /* File extensions */ ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".ac", ".adoc", ".am", - ".asc", ".asciidoc", ".bmp", ".build", ".cfg", ".chm", ".class", ".cmake", + ".asc", ".asciidoc", ".bmp", ".build", ".cfg", ".chm", ".cmake", ".cnf", ".conf", ".config", ".contributors", ".copying", ".crt", ".csproj", - ".css", ".csv", ".cvsignore", ".dat", ".data", ".db", ".doc", ".ds_store", + ".csv", ".cvsignore", ".dat", ".data", ".db", ".doc", ".ds_store", ".dtd", ".dts", ".dtsi", ".dump", ".eot", ".eps", ".geojson", ".gdoc", ".gif", ".gitignore", ".glif", ".gmo", ".gradle", ".guess", ".hex", ".htm", ".html", ".ico", ".in", ".inc", ".info", ".ini", ".ipynb", ".jpeg", ".jpg", ".json", @@ -49,10 +49,10 @@ char *IGNORED_EXTENSIONS[] = { ".spec", ".sql", ".sub", ".svg", ".svn-base", ".tab", ".template", ".test", ".tex", ".tiff", ".toml", ".ttf", ".txt", ".utf-8", ".vim", ".wav", ".whl", ".woff", ".xht", ".xhtml", ".xls", ".xml", ".xpm", ".xsd", ".xul", ".yaml", - ".yml", ".LAS",".adk",".asc",".cif",".cli",".cosmo",".deploy", + ".yml", ".LAS",".adk",".asc",".cif",".cli",".cosmo",".deploy",".pom", ".dfm",".dmm",".fa",".fasta",".fcb",".flm",".fna",".gbr",".gen",".gro", ".hgtags",".hh",".ihex",".kp",".mpx",".pdb",".poly",".prn",".ps",".ref", - ".resx",".smp",".stg",".tfa",".tsv",".vcf",".vhd",".xy",".xyz", + ".resx",".smp",".stg",".tfa",".tsv",".vcf",".vhd",".xy",".xyz",".xsd" /* File endings */ diff --git a/src/ignorelist.c b/src/ignorelist.c index 61ad1be..2fdf920 100644 --- a/src/ignorelist.c +++ b/src/ignorelist.c @@ -35,6 +35,7 @@ #include "ignorelist.h" #include "ignored_extensions.h" +#include "debug.h" /** * @brief Returns a pointer to the file extension of "path" @@ -100,7 +101,11 @@ bool ignored_extension(char *name) { int i=0; while (IGNORED_EXTENSIONS[i]) - if (ends_with(IGNORED_EXTENSIONS[i++], name)) return true; + if (ends_with(IGNORED_EXTENSIONS[i++], name)) + { + scanlog("Component ignored by path extension: %s", name); + return true; + } return false; } diff --git a/src/match.c b/src/match.c index abf44e6..5811e11 100644 --- a/src/match.c +++ b/src/match.c @@ -131,63 +131,133 @@ static int hint_eval(component_data_t *a, component_data_t *b) return 0; } +// own function to compare strings and rate the similarity +static int string_compare(const char *string1, const char *string2) +{ + float difference = 0; + float div = (strlen(string1) + strlen(string2)) / 2; + if (!strcmp(string1, string2)) + return -1; + + while (*string1 != '\0' && *string2 != '\0') + { + if (*string1 != *string2) + { + difference += 1 / div; + } + + string1++; + string2++; + } + + // Add the length difference if the strings have different lengths + difference += abs((int)strlen(string1) - (int)strlen(string2)) / div; + + return ceil(difference); +} + +static bool look_for_version(char *in) +{ + if (!in) + return false; + char *v = strstr(in, "-v"); + if (v && isdigit(*(v + 1))) + return true; + + v = strchr(in, '-'); + if (v && isdigit(*(v + 1)) && isdigit(*(v + 2))) + return true; + + return false; +} + +#define PATH_LEVEL_COMP_INIT_VALUE 1000 +#define PATH_LEVEL_COMP_REF 10 // Function to compare the similarity of two paths from back to front -int comparePaths(char *a, char *b) { - // Pointers to traverse the paths from the end - char *ptr_a = strrchr(a, '/'); - char *ptr_b = strrchr(b, '/'); +static int paths_compare(const char *a, const char *b) +{ + // Pointers to traverse the paths from the end + const char *ptr_a = strrchr(a, '/'); + if (!ptr_a) + ptr_a = a; + + const char *ptr_b = strrchr(b, '/'); + if (!ptr_b) + ptr_b = b; + + const char *ptr_a_prev = a + strlen(a) - 1; + const char *ptr_b_prev = b + strlen(b) - 1; - char * ptr_a_prev = a + strlen(a) - 1; - char * ptr_b_prev = b + strlen(b) - 1; + int rank = PATH_LEVEL_COMP_REF; + // check if both path have equal lenght + if (strlen(a) == strlen(b)) + rank--; - // Variables to count level matches - int match = 0; while (ptr_a >= a && ptr_b >= b) { - char * level_a = strndup(ptr_a, ptr_a_prev - ptr_a); - char * level_b = strndup(ptr_b, ptr_b_prev - ptr_b); - - printf("%s / %s\n", ptr_a, ptr_b); + // Look for each path level + if ((*ptr_a == '/' || ptr_a == a) && (*ptr_b == '/' || ptr_b == b)) + { + size_t size_a = ptr_a_prev - ptr_a; + size_t size_b = ptr_b_prev - ptr_b; - // Compare the current levels - int comparison = strcmp(level_a, level_b); + char *level_a = strndup(ptr_a, size_a); + char *level_b = strndup(ptr_b, size_b); - printf("A: %s - B: %s - rank: %d\n", level_a, level_b, comparison); - free(level_a); - free(level_b); + // Compare the current levels - the level will be ignored if it has version information inside. + if (!(look_for_version(level_a) || look_for_version(level_b))) + rank += string_compare(*level_a == '/' ? level_a + 1 : level_a, *level_b == '/' ? level_b + 1 : level_b); - ptr_a_prev = ptr_a; - ptr_b_prev = ptr_b; - ptr_a--; - ptr_b--; - ptr_a = strrchr(ptr_a, '/'); - if (!ptr_a) - ptr_a = a; - ptr_b = strrchr(ptr_b, '/'); - if (!ptr_b) - ptr_b = b; + free(level_a); + free(level_b); + // Move pointers + if (ptr_a > a) + { + ptr_a_prev = ptr_a; + ptr_a--; + } + if (ptr_b > b) + { + ptr_b_prev = ptr_b; + ptr_b--; + } + rank--; + } + if (ptr_a == a && ptr_b == b) + break; - if (comparison == 0) - { - match++; - } /*else { - // Adjust the match if levels do not match exactly - // You can adjust these values according to your needs - match += 1 - (float)comparison / 128; - }*/ + // look for the next levels + if (!(*ptr_a == '/' || ptr_a == a)) + ptr_a--; + if (!(*ptr_b == '/' || ptr_b == b)) + ptr_b--; } - printf("---\n"); - - // Compare each level of the paths from back to front - // Calculate the similarity index - //int index = (match * 100) / ((lenA > lenB) ? lenA : lenB); + return rank; +} - return match; +static void evaluate_path_rank(component_data_t *comp) +{ + if (comp->path_rank == PATH_LEVEL_COMP_INIT_VALUE) + { + //generate the rank based on the similarity of the paths. + comp->path_rank = paths_compare(comp->file_path_ref, comp->file); + //modulate the result based on component information- + if (comp->path_rank < PATH_LEVEL_COMP_REF && (strstr(comp->file_path_ref, comp->component) || strstr(comp->file_path_ref, comp->vendor))) + { + comp->path_rank -= PATH_LEVEL_COMP_REF / 5 + 1; + if (strstr(comp->file_path_ref, comp->component) && strstr(comp->file_path_ref, comp->vendor)) + { + comp->path_rank -= PATH_LEVEL_COMP_REF / 2; + } + if (strstr(comp->purls[0], "github")) + comp->path_rank--; + } + } } /** @@ -199,7 +269,6 @@ int comparePaths(char *a, char *b) { * @return false "a" wins, compare with the next component. */ -static char * file_path = NULL; static bool component_hint_date_comparation(component_data_t *a, component_data_t *b) { if (declared_components) @@ -220,19 +289,37 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ else if (component_hint) { + scanlog("hint eval\n"); int result = hint_eval(a,b); if (result > 0) return true; if (result < 0) return false; } - if (file_path) + + if ((engine_flags & ENABLE_PATH_HINT) && a->file_path_ref && b->file_path_ref) { - int a_cmp = comparePaths(file_path, a->file); - int b_cmp = comparePaths(file_path, b->file); - if (b_cmp - a_cmp > 1) - return true; - if (b_cmp - a_cmp < 1) + //evalute path rank for component a + evaluate_path_rank(a); + + //evalute path rank for component b + evaluate_path_rank(b); + + //The path_rank will be used as hint only when it has a reasonable value, in other cases the critea will be ignored. + if (b->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) + { + if (b->path_rank - a->path_rank < 0) + { + scanlog("%s wins %s by path rank %d\n", b->purls[0], a->purls[0], b->path_rank); + return true; + } + if (b->path_rank - a->path_rank > 0) + { + scanlog("%s - %s loses %s by path rank %d/%d\n", b->purls[0],b->file, a->purls[0], b->path_rank, a->path_rank); + return false; + } + } + else if (a->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) return false; } if (!*b->release_date) @@ -291,12 +378,17 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, /* The oldest component will be the first in the list, if two components have the same age the purl date will untie */ new_comp->identified = IDENTIFIED_NONE; asset_declared(new_comp); - file_path = component_list->match_ref->scan_ower->file_path; + new_comp->file_path_ref = component_list->match_ref->scan_ower->file_path; + new_comp->path_rank = PATH_LEVEL_COMP_INIT_VALUE; + scanlog("--- new comp ---\n"); if (!component_list_add(component_list, new_comp, component_hint_date_comparation, true)) { - scanlog("component rejected by date: %s\n", new_comp->purls[0]); + scanlog("component rejected: %s\n", new_comp->purls[0]); component_data_free(new_comp); /* Free if the componet was rejected */ } + else + scanlog("component accepted: %s - pathrank: %d\n", new_comp->purls[0], new_comp->path_rank); + } else { @@ -353,12 +445,11 @@ bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * memcpy(url_id, raw_data, MD5_LEN); char path[MAX_FILE_PATH+1]; strncpy(path, decrypted, MAX_FILE_PATH); - if (!ignored_extension(path)) + //check the ignore list only if the match type is MATCH_SNIPPET. TODO: remove this after remine everything. + if (!(component_list->match_ref->type == MATCH_SNIPPET && !ignored_extension(path))) add_component_from_urlid(component_list, url_id, path); free(decrypted); - - //scanlog("#%d File %s\n", iteration, files[iteration].path); return false; } @@ -445,6 +536,7 @@ bool load_matches(match_data_t *match) /*if the next component has dependencies, permute */ else if (print_dependencies(item->entries.le_next->component)) { + scanlog("permute due to dependencies\n"); struct comp_entry *aux = item->entries.le_next->entries.le_next; LIST_INSERT_HEAD(&match->component_list.headp, item->entries.le_next, entries); item->entries.le_next = aux; diff --git a/src/match_list.c b/src/match_list.c index e39f52d..928c74c 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -80,7 +80,7 @@ bool component_list_add(component_list_t *list, component_data_t *new_comp, bool if (!list->headp.lh_first) { - scanlog("first component in list\n"); + scanlog("first component in list %s\n", new_comp->purls[0]); struct comp_entry *nn = calloc(1, sizeof(struct comp_entry)); /* Insert at the head. */ LIST_INSERT_HEAD(&list->headp, nn, entries); nn->component = new_comp; diff --git a/src/report.c b/src/report.c index 5115b85..0d1746b 100644 --- a/src/report.c +++ b/src/report.c @@ -242,6 +242,7 @@ bool print_json_component(component_data_t * component) printf("\"release_date\": \"%s\",", component->release_date); printf("\"file\": \"%s\",", component->url_match == true ? basename(component->url) : component->file); + printf("\"path_rank\": %d,", component->path_rank); char *url_id = md5_hex(component->url_md5); printf("\"url_hash\": \"%s\"", url_id); diff --git a/src/scan.c b/src/scan.c index 0a4ac23..6005aad 100644 --- a/src/scan.c +++ b/src/scan.c @@ -408,19 +408,13 @@ void output_matches_json(scan_data_t *scan) * * @param scan */ -void ldb_scan(scan_data_t * scan) +void ldb_scan(scan_data_t *scan) { - bool skip = false; if (!scan) return; - if (unwanted_path(scan->file_path)) - { - skip = true; - scanlog("File %s skipped by path", scan->file_path); - } /* LDB must be available to proceed with the scan*/ - if (!ldb_table_exists(oss_file.db, oss_file.table) || !ldb_table_exists(oss_url.db, oss_url.table)) + if (!ldb_table_exists(oss_file.db, oss_file.table) || !ldb_table_exists(oss_url.db, oss_url.table)) { printf("Error: file and url tables must be present in %s KB in order to proceed with the scan\n", oss_file.db); free(scan); @@ -433,74 +427,66 @@ void ldb_scan(scan_data_t * scan) /* Get file length */ uint64_t file_size = 0; - if (!skip) - { - if (scan->preload) file_size = atoi(scan->file_size); - else file_size = get_file_size(scan->file_path); - if (file_size < 0) ldb_error("Cannot access file"); - } + + if (scan->preload) + file_size = atoi(scan->file_size); + else + file_size = get_file_size(scan->file_path); + + if (file_size < 0) + ldb_error("Cannot access file"); /* Calculate MD5 hash (if not already preloaded) */ - if (!skip) if (!scan->preload) get_file_md5(scan->file_path, scan->md5); + if (!scan->preload) + get_file_md5(scan->file_path, scan->md5); - if (!skip && extension(scan->file_path) && ignored_extension(scan->file_path)) - { - skip = true; - scanlog("File %s skipped by extension", scan->file_path); - } + /* Scan full file */ + char *tmp_md5_hex = md5_hex(scan->md5); + strcpy(scan->source_md5, tmp_md5_hex); + free(tmp_md5_hex); - /* Ignore <=1 byte */ - if (file_size <= MIN_FILE_SIZE) - { - skip = true; - scanlog("File %s skipped by file size < %d\n", scan->file_path, MIN_FILE_SIZE); - } + /* Look for full file match or url match in ldb */ + scan->match_type = ldb_scan_file(scan); - if (!skip) + /* If no match, scan snippets */ + if (scan->match_type == MATCH_NONE || force_snippet_scan) { - /* Scan full file */ - char *tmp_md5_hex = md5_hex(scan->md5); - strcpy(scan->source_md5, tmp_md5_hex); - free(tmp_md5_hex); - - /* Look for full file match or url match in ldb */ - scan->match_type = ldb_scan_file(scan); - - /* If no match, scan snippets */ - if (scan->match_type == MATCH_NONE || force_snippet_scan) + /* Load snippets into scan data */ + if (!scan->preload) { - /* Load snippets into scan data */ - if (!scan->preload) + /* Read file into memory */ + char *src = calloc(MAX_FILE_SIZE, 1); + if (file_size < MAX_FILE_SIZE) + read_file(src, scan->file_path, 0); + + /* If HPSM is enable calculate the crc8 line hash calling the shared lib */ + if (hpsm_enabled) { - /* Read file into memory */ - char *src = calloc(MAX_FILE_SIZE, 1); - if (file_size < MAX_FILE_SIZE) read_file(src, scan->file_path, 0); - - /* If HPSM is enable calculate the crc8 line hash calling the shared lib */ - if(hpsm_enabled) + char *aux = hpsm_hash_file_contents(src); + if (aux) { - char *aux = hpsm_hash_file_contents(src); - if(aux) - { - hpsm_crc_lines = strdup(&aux[5]); - free(aux); - } - } - /* Determine if file is to skip snippet search */ - if (!skip_snippets(src, file_size)) - { /* Load wfps into scan structure */ - scan->hash_count = winnowing(src, scan->hashes, scan->lines, MAX_FILE_SIZE); - if (scan->hash_count) scan->total_lines = scan->lines[scan->hash_count - 1]; + hpsm_crc_lines = strdup(&aux[5]); + free(aux); } - free(src); } - else if (scan->hash_count) scan->total_lines = scan->lines[scan->hash_count - 1]; + /* Determine if file is to skip snippet search */ + if (!skip_snippets(src, file_size)) + { /* Load wfps into scan structure */ + scan->hash_count = winnowing(src, scan->hashes, scan->lines, MAX_FILE_SIZE); + if (scan->hash_count) + scan->total_lines = scan->lines[scan->hash_count - 1]; + } + free(src); + } + else if (scan->hash_count) + scan->total_lines = scan->lines[scan->hash_count - 1]; - /* Perform snippet scan */ - if (scan->total_lines) scan->match_type = ldb_scan_snippets(scan); + /* Perform snippet scan */ + if (scan->total_lines) + scan->match_type = ldb_scan_snippets(scan); - else scanlog("File skipped\n"); - } + else + scanlog("File skipped\n"); } /* Compile matches */ @@ -508,7 +494,7 @@ void ldb_scan(scan_data_t * scan) if (!scan->best_match) scanlog("No best match\n"); - + /* Output matches */ scanlog("Match output starts\n"); if (!quiet)