From 5d21f846c3bd1a7c91c4cf2ac57d3299dfcc2f67 Mon Sep 17 00:00:00 2001 From: Mariano Scasso <75589700+mscasso-scanoss@users.noreply.github.com> Date: Tue, 27 Feb 2024 09:45:23 -0300 Subject: [PATCH] 5.4.0 (#64) * Add path similarity logic. Update flag 2048. * Remove ignore file logic on scan and update ignored extensions. * add lines coverage to snippet analysis. * Update Makefile, add live ldb version check * update help * solve minor bug with hints and dependencies tiebreak. * improve memory management for failed scans. * Solve memory segfault processing sbom. --- Makefile | 11 +- inc/component.h | 2 + inc/scanoss.h | 4 +- src/binary_scan.c | 9 +- src/component.c | 28 ++++- src/file.c | 17 +-- src/help.c | 2 +- src/ignored_extensions.c | 6 +- src/ignorelist.c | 7 +- src/main.c | 134 ++++++++++-------------- src/match.c | 220 ++++++++++++++++++++++++++++++++------- src/match_list.c | 2 +- src/mz.c | 65 ------------ src/report.c | 22 +--- src/scan.c | 116 +++++++++------------ src/snippets.c | 43 ++++++-- 16 files changed, 394 insertions(+), 294 deletions(-) diff --git a/Makefile b/Makefile index 101aec7..6b8e097 100644 --- a/Makefile +++ b/Makefile @@ -5,12 +5,9 @@ endif LDFLAGS+= -lldb -lm -lpthread -ldl LDB_CURRENT_VERSION := $(shell ldb -v | sed 's/ldb-//' | head -c 3) -LDB_TARGET_VERSION := 3.2 +LDB_TARGET_VERSION := 4.1 VERSION_IS_LESS := $(shell echo $(LDB_CURRENT_VERSION) \< $(LDB_TARGET_VERSION) | bc) -ifeq ($(VERSION_IS_LESS),1) - LDFLAGS += -lcrypto -lz -endif CCFLAGS ?= -O -lz -Wall -Wno-unused-result -Wno-deprecated-declarations -g -Iinc -Iexternal/inc -D_LARGEFILE64_SOURCE -D_GNU_SOURCE SOURCES=$(wildcard src/*.c) $(wildcard src/**/*.c) $(wildcard external/*.c) $(wildcard external/**/*.c) @@ -20,8 +17,10 @@ TARGET=scanoss # Regla de prueba $(TARGET): $(OBJECTS) - @echo "Current version: $(LDB_CURRENT_VERSION)" - @echo "LDFLAGS: $(LDFLAGS)" +ifeq ($(VERSION_IS_LESS),1) + @echo "Current LDB version: $(LDB_CURRENT_VERSION) is too old, please update to the lastest version to continue." + exit 1 +endif $(CC) -g -o $(TARGET) $^ $(LDFLAGS) diff --git a/inc/component.h b/inc/component.h index b5dbe3b..1f9f086 100644 --- a/inc/component.h +++ b/inc/component.h @@ -41,6 +41,8 @@ typedef struct component_data_t char * dependency_text; /* used in json output generation */ char * health_text; /* used in json output generation */ int hits; /*used in binary analysis*/ + char * file_path_ref; + int path_rank; } component_data_t; component_data_t * component_init(void); diff --git a/inc/scanoss.h b/inc/scanoss.h index 86552f1..deb9346 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -44,7 +44,7 @@ #define WFP_REC_LN 18 /* Log files */ -#define SCANOSS_VERSION "5.3.5" +#define SCANOSS_VERSION "5.4.0" #define SCAN_LOG "/tmp/scanoss_scan.log" #define MAP_DUMP "/tmp/scanoss_map.dump" #define SLOW_QUERY_LOG "/tmp/scanoss_slow_query.log" @@ -65,7 +65,7 @@ #define DISABLE_BEST_MATCH 256 #define DISABLE_REPORT_IDENTIFIED 512 #define ENABLE_DOWNLOAD_URL 1024 -#define ENABLE_GITHUB_FULL_PATH 2048 +#define ENABLE_PATH_HINT 2048 #define DISABLE_SERVER_INFO 4096 #define DISABLE_HEALTH 8192 #define ENABLE_HIGH_ACCURACY 16384 diff --git a/src/binary_scan.c b/src/binary_scan.c index a8c2ce7..bda72e6 100644 --- a/src/binary_scan.c +++ b/src/binary_scan.c @@ -161,7 +161,11 @@ static bool get_all_file_ids(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8 static void fhash_process(char * hash, component_list_t * comp_list) { - struct ldb_table oss_fhash = {.db = "oss", .table = "fhashes", .key_ln = 16, .rec_ln = 0, .ts_ln = 2, .tmp = false}; + struct ldb_table oss_fhash = {.db = "oss", .table = "fhashes", .key_ln = 16, .rec_ln = 0, .ts_ln = 2, .tmp = false, .keys=2, .definitions = 0}; + + if (!ldb_table_exists(oss_fhash.db, oss_fhash.table)) // skip if the table is not present + return; + uint8_t fhash[16]; ldb_hex_to_bin(hash, 32, fhash); /* Get all file IDs for given wfp */ @@ -304,7 +308,10 @@ int binary_scan(char * input) break; component_list_destroy(result.components); free(result.file); + result.file = NULL; free(result.md5); + result.md5 = NULL; + sensibility++; }; diff --git a/src/component.c b/src/component.c index 76093c8..d15ff30 100644 --- a/src/component.c +++ b/src/component.c @@ -177,6 +177,32 @@ bool ignored_asset_match(uint8_t *url_record) return found; } +static char * look_for_version(char *in) +{ + if (!in) + return NULL; + bool is_ver = false; + + char *v = strstr(in, "-v"); + if (v && isdigit(*(v + 2))) + is_ver = true; + else + { + v = strchr(in, '.'); + if (v && isdigit(*(v + 1)) && (*(v + 2) == '.' || isdigit(*(v + 2)))) + is_ver = true; + } + + if (is_ver) + { + char * p = strchr(v, '/'); + if (p) + return (p+1); + } + + return in; +} + /** * @brief Fill the match structure * @param url_key md5 of the match url @@ -203,7 +229,7 @@ bool fill_component(component_data_t *component, uint8_t *url_key, char *file_pa memcpy(component->url_md5, url_key, MD5_LEN); if (file_path) { - component->file = strdup(file_path); + component->file = strdup(look_for_version(file_path)); component->path_ln = strlen(file_path); flip_slashes(component->file); } diff --git a/src/file.c b/src/file.c index 44fad56..eb84dd4 100644 --- a/src/file.c +++ b/src/file.c @@ -139,27 +139,32 @@ void get_file_md5(char *filepath, uint8_t *md5_result) /* Read file contents into buffer */ FILE *in = fopen(filepath, "rb"); + + if (!in) + { + MD5(NULL, 0, md5_result); + return; + } + fseek(in, 0L, SEEK_END); long filesize = ftell(in); - if (!filesize) { MD5(NULL, 0, md5_result); } - else { /* Read file contents */ fseek(in, 0L, SEEK_SET); uint8_t *buffer = malloc(filesize); - if (!fread(buffer, filesize, 1, in)) fprintf(stderr, "Warning: cannot open file %s\n", filepath); + if (!fread(buffer, filesize, 1, in)) + fprintf(stderr, "Warning: cannot open file %s\n", filepath); /* Calculate MD5sum */ MD5(buffer, filesize, md5_result); - free (buffer); + free(buffer); + fclose(in); } - - fclose(in); } /** diff --git a/src/help.c b/src/help.c index d06ffad..4def460 100644 --- a/src/help.c +++ b/src/help.c @@ -88,7 +88,7 @@ Alternatively, these value can be written in %s\n\ | 256 | Disable best match only (default: enabled) |\n\ | 512 | Hide identified files (default: disabled) |\n\ | 1024 | Enable download_url (default: disabled) |\n\ -| 2048 | Enable GitHub full path (default: disabled) |\n\ +| 2048 | Enable \"use path hint\" logic (default: disabled) |\n\ | 4096 | Disable extended server stats (default: enabled) |\n\ | 8192 | Disable health layer (default: enabled) |\n\ | 16384 | Enable high accuracy, slower scan (default: disabled) |\n\ diff --git a/src/ignored_extensions.c b/src/ignored_extensions.c index 61c0221..375a0eb 100644 --- a/src/ignored_extensions.c +++ b/src/ignored_extensions.c @@ -36,9 +36,9 @@ char *IGNORED_EXTENSIONS[] = { /* File extensions */ ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".ac", ".adoc", ".am", - ".asc", ".asciidoc", ".bmp", ".build", ".cfg", ".chm", ".class", ".cmake", + ".asc", ".asciidoc", ".bmp", ".build", ".cfg", ".chm", ".cmake", ".cnf", ".conf", ".config", ".contributors", ".copying", ".crt", ".csproj", - ".css", ".csv", ".cvsignore", ".dat", ".data", ".db", ".doc", ".ds_store", + ".csv", ".cvsignore", ".dat", ".data", ".db", ".doc", ".ds_store", ".dtd", ".dts", ".dtsi", ".dump", ".eot", ".eps", ".geojson", ".gdoc", ".gif", ".gitignore", ".glif", ".gmo", ".gradle", ".guess", ".hex", ".htm", ".html", ".ico", ".in", ".inc", ".info", ".ini", ".ipynb", ".jpeg", ".jpg", ".json", @@ -49,7 +49,7 @@ char *IGNORED_EXTENSIONS[] = { ".spec", ".sql", ".sub", ".svg", ".svn-base", ".tab", ".template", ".test", ".tex", ".tiff", ".toml", ".ttf", ".txt", ".utf-8", ".vim", ".wav", ".whl", ".woff", ".xht", ".xhtml", ".xls", ".xml", ".xpm", ".xsd", ".xul", ".yaml", - ".yml", ".LAS",".adk",".asc",".cif",".cli",".cosmo",".deploy", + ".yml", ".LAS",".adk",".asc",".cif",".cli",".cosmo",".deploy",".pom", ".dfm",".dmm",".fa",".fasta",".fcb",".flm",".fna",".gbr",".gen",".gro", ".hgtags",".hh",".ihex",".kp",".mpx",".pdb",".poly",".prn",".ps",".ref", ".resx",".smp",".stg",".tfa",".tsv",".vcf",".vhd",".xy",".xyz", diff --git a/src/ignorelist.c b/src/ignorelist.c index 61ad1be..2fdf920 100644 --- a/src/ignorelist.c +++ b/src/ignorelist.c @@ -35,6 +35,7 @@ #include "ignorelist.h" #include "ignored_extensions.h" +#include "debug.h" /** * @brief Returns a pointer to the file extension of "path" @@ -100,7 +101,11 @@ bool ignored_extension(char *name) { int i=0; while (IGNORED_EXTENSIONS[i]) - if (ends_with(IGNORED_EXTENSIONS[i++], name)) return true; + if (ends_with(IGNORED_EXTENSIONS[i++], name)) + { + scanlog("Component ignored by path extension: %s", name); + return true; + } return false; } diff --git a/src/main.c b/src/main.c index fd2f498..1043df0 100644 --- a/src/main.c +++ b/src/main.c @@ -65,91 +65,61 @@ component_item *declared_components; uint8_t trace_id[MD5_LEN]; bool trace_on; - +#define LDB_VER_MIN "4.1.0" /* Initialize tables for the DB name indicated (defaults to oss) */ void initialize_ldb_tables(char *name) { + + char * ldb_ver = NULL; + ldb_version(&ldb_ver); + scanlog("ldb version: %s\n", ldb_ver); + + if (!ldb_ver || strcmp(ldb_ver, LDB_VER_MIN) < 0) + { + fprintf(stderr, "The current ldb version %s is too old, please upgrade to %s to proceed\n", ldb_ver, LDB_VER_MIN); + exit(EXIT_FAILURE); + } + free(ldb_ver); + char oss_db_name[MAX_ARGLN]; if (name) strcpy(oss_db_name, name); else strcpy(oss_db_name, DEFAULT_OSS_DB_NAME); - strcpy(oss_url.db, oss_db_name); - strcpy(oss_url.table, "url"); - oss_url.key_ln = 16; - oss_url.rec_ln = 0; - oss_url.ts_ln = 2; - oss_url.tmp = false; - - strcpy(oss_file.db, oss_db_name); - strcpy(oss_file.table, "file"); - oss_file.key_ln = 16; - oss_file.rec_ln = 0; - oss_file.ts_ln = 2; - oss_file.tmp = false; - - strcpy(oss_wfp.db, oss_db_name); - strcpy(oss_wfp.table, "wfp"); - oss_wfp.key_ln = 4; - oss_wfp.rec_ln = 18; - oss_wfp.ts_ln = 2; - oss_wfp.tmp = false; - - strcpy(oss_purl.db, oss_db_name); - strcpy(oss_purl.table, "purl"); - oss_purl.key_ln = 16; - oss_purl.rec_ln = 0; - oss_purl.ts_ln = 2; - oss_purl.tmp = false; - - strcpy(oss_copyright.db, oss_db_name); - strcpy(oss_copyright.table, "copyright"); - oss_copyright.key_ln = 16; - oss_copyright.rec_ln = 0; - oss_copyright.ts_ln = 2; - oss_copyright.tmp = false; - - strcpy(oss_quality.db, oss_db_name); - strcpy(oss_quality.table, "quality"); - oss_quality.key_ln = 16; - oss_quality.rec_ln = 0; - oss_quality.ts_ln = 2; - oss_quality.tmp = false; - - strcpy(oss_vulnerability.db, oss_db_name); - strcpy(oss_vulnerability.table, "vulnerability"); - oss_vulnerability.key_ln = 16; - oss_vulnerability.rec_ln = 0; - oss_vulnerability.ts_ln = 2; - oss_vulnerability.tmp = false; - - strcpy(oss_dependency.db, oss_db_name); - strcpy(oss_dependency.table, "dependency"); - oss_dependency.key_ln = 16; - oss_dependency.rec_ln = 0; - oss_dependency.ts_ln = 2; - oss_dependency.tmp = false; - - strcpy(oss_license.db, oss_db_name); - strcpy(oss_license.table, "license"); - oss_license.key_ln = 16; - oss_license.rec_ln = 0; - oss_license.ts_ln = 2; - oss_license.tmp = false; - - strcpy(oss_attribution.db, oss_db_name); - strcpy(oss_attribution.table, "attribution"); - oss_attribution.key_ln = 16; - oss_attribution.rec_ln = 0; - oss_attribution.ts_ln = 2; - oss_attribution.tmp = false; - - strcpy(oss_cryptography.db, oss_db_name); - strcpy(oss_cryptography.table, "cryptography"); - oss_cryptography.key_ln = 16; - oss_cryptography.rec_ln = 0; - oss_cryptography.ts_ln = 2; - oss_cryptography.tmp = false; + char dbtable[MAX_ARGLN * 2]; + scanlog("Loading tables definitions\n"); + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "url"); + oss_url = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "file"); + oss_file = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "wfp"); + oss_wfp = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "purl"); + oss_purl = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "copyright"); + oss_copyright = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "quality"); + oss_quality = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "vulnerability"); + oss_vulnerability = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "dependency"); + oss_dependency = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "license"); + oss_license = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "attribution"); + oss_attribution = ldb_read_cfg(dbtable); + + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "cryptography"); + oss_cryptography = ldb_read_cfg(dbtable); kb_version_get(); osadl_load_file(); @@ -304,12 +274,10 @@ int main(int argc, char **argv) microseconds_start = microseconds_now(); - initialize_ldb_tables(NULL); - /* Parse arguments */ int option; bool invalid_argument = false; - + char * ldb_db_name = NULL; while ((option = getopt(argc, argv, ":f:s:b:B:c:k:a:F:l:n:i:M:N:wtvhedqH")) != -1) { /* Check valid alpha is entered */ @@ -339,6 +307,7 @@ int main(int argc, char **argv) break; case 'k': + initialize_ldb_tables(ldb_db_name); mz_file_contents(optarg, oss_file.db); exit(EXIT_SUCCESS); break; @@ -359,7 +328,7 @@ int main(int argc, char **argv) break; case 'n': - initialize_ldb_tables(optarg); + ldb_db_name = strdup(optarg); break; case 'M': scan_max_snippets = atol(optarg); @@ -475,6 +444,9 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } + initialize_ldb_tables(ldb_db_name); + free(ldb_db_name); + /* Remove trailing backslashes from target (if any) */ strcpy (target, argv[argc-1]); for (int i=strlen(target)-1; i>=0; i--) if (target[i]=='/') target[i]=0; else break; diff --git a/src/match.c b/src/match.c index ee63e80..680357f 100644 --- a/src/match.c +++ b/src/match.c @@ -107,12 +107,13 @@ static int hint_eval(component_data_t *a, component_data_t *b) /*Check for component hint in purl, select components matching with the hint */ if (a->purls[0] && strstr(a->purls[0], component_hint) && !(b->purls[0] && strstr(b->purls[0], component_hint))) { - scanlog("Reject component %s by hint: %s\n", b->purls[0], component_hint); + scanlog("Reject component %s by purl hint: %s\n", b->purls[0], component_hint); return -1; } if (b->purls[0] && strstr(b->purls[0], component_hint) && !(a->purls[0] && strstr(a->purls[0], component_hint))) { - scanlog("Accept component %s by hint: %s\n", b->purls[0], component_hint); + scanlog("Accept component %s by purl hint: %s\n", b->purls[0], component_hint); + b->identified = 1; return 1; } @@ -125,12 +126,127 @@ static int hint_eval(component_data_t *a, component_data_t *b) if (b->component && strstr(b->component, component_hint) && !(a->component && strstr(a->purls[0], component_hint))) { scanlog("Accept component %s by hint: %s\n", b->component, component_hint); + b->identified = 1; return 1; } return 0; } +// own function to compare strings and rate the similarity +static int string_compare(const char *string1, const char *string2) +{ + float difference = 0; + float div = (strlen(string1) + strlen(string2)) / 2; + if (!strcmp(string1, string2)) + return -1; + + while (*string1 != '\0' && *string2 != '\0') + { + if (*string1 != *string2) + { + difference += 1 / div; + } + + string1++; + string2++; + } + + // Add the length difference if the strings have different lengths + difference += abs((int)strlen(string1) - (int)strlen(string2)) / div; + + return ceil(difference); +} + +#define PATH_LEVEL_COMP_INIT_VALUE 1000 +#define PATH_LEVEL_COMP_REF 10 +// Function to compare the similarity of two paths from back to front +static int paths_compare(const char *a, const char *b) +{ + // Pointers to traverse the paths from the end + const char *ptr_a = strrchr(a, '/'); + if (!ptr_a) + ptr_a = a; + + const char *ptr_b = strrchr(b, '/'); + if (!ptr_b) + ptr_b = b; + + const char *ptr_a_prev = a + strlen(a) - 1; + const char *ptr_b_prev = b + strlen(b) - 1; + + int rank = PATH_LEVEL_COMP_REF; + // check if both path have equal lenght + if (strlen(a) == strlen(b)) + rank--; + + while (ptr_a >= a && ptr_b >= b) + { + // Look for each path level + if ((*ptr_a == '/' || ptr_a == a) && (*ptr_b == '/' || ptr_b == b)) + { + size_t size_a = ptr_a_prev - ptr_a; + size_t size_b = ptr_b_prev - ptr_b; + + char *level_a = strndup(ptr_a, size_a); + char *level_b = strndup(ptr_b, size_b); + + // Compare the current levels + rank += string_compare(*level_a == '/' ? level_a + 1 : level_a, *level_b == '/' ? level_b + 1 : level_b); + + free(level_a); + free(level_b); + + // Move pointers + if (ptr_a > a) + { + ptr_a_prev = ptr_a; + ptr_a--; + } + if (ptr_b > b) + { + ptr_b_prev = ptr_b; + ptr_b--; + } + rank--; + } + + if (ptr_a == a && ptr_b == b) + break; + + // look for the next levels + if (!(*ptr_a == '/' || ptr_a == a)) + ptr_a--; + + if (!(*ptr_b == '/' || ptr_b == b)) + ptr_b--; + } + + return rank; +} + +static void evaluate_path_rank(component_data_t *comp) +{ + if (comp->path_rank == PATH_LEVEL_COMP_INIT_VALUE) + { + //generate the rank based on the similarity of the paths. + comp->path_rank = paths_compare(comp->file_path_ref, comp->file); + + //modulate the result based on component information- + if (comp->path_rank < PATH_LEVEL_COMP_REF && (strstr(comp->file_path_ref, comp->component) || strstr(comp->file_path_ref, comp->vendor))) + { + comp->path_rank -= PATH_LEVEL_COMP_REF / 5 + 1; + if (strstr(comp->file_path_ref, comp->component) && strstr(comp->file_path_ref, comp->vendor)) + { + comp->path_rank -= PATH_LEVEL_COMP_REF / 2; + } + if (strstr(comp->purls[0], ".mirror")) + comp->path_rank+=PATH_LEVEL_COMP_REF / 2; + else if (strstr(comp->purls[0], "github")) + comp->path_rank--; + } + } +} /** * @brief Funtion to be called as pointer when a new compoent has to be loaded in to the list @@ -140,6 +256,7 @@ static int hint_eval(component_data_t *a, component_data_t *b) * @return true b has to be included in the list before "a" * @return false "a" wins, compare with the next component. */ + static bool component_hint_date_comparation(component_data_t *a, component_data_t *b) { if (declared_components) @@ -160,13 +277,39 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ else if (component_hint) { + scanlog("hint eval\n"); int result = hint_eval(a,b); if (result > 0) return true; if (result < 0) return false; } + + if ((engine_flags & ENABLE_PATH_HINT) && a->file_path_ref && b->file_path_ref) + { + //evalute path rank for component a + evaluate_path_rank(a); + + //evalute path rank for component b + evaluate_path_rank(b); + //The path_rank will be used as hint only when it has a reasonable value, in other cases the critea will be ignored. + if (b->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) + { + if (b->path_rank - a->path_rank < 0) + { + scanlog("%s wins %s by path rank %d\n", b->purls[0], a->purls[0], b->path_rank); + return true; + } + if (b->path_rank - a->path_rank > 0) + { + scanlog("%s - %s loses %s by path rank %d/%d\n", b->purls[0],b->file, a->purls[0], b->path_rank, a->path_rank); + return false; + } + } + else if (a->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) + return false; + } if (!*b->release_date) return false; if (!*a->release_date) @@ -202,39 +345,45 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return false; } - -bool add_component_from_urlid(component_list_t * component_list, uint8_t* url_id, char * path) + +bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, char *path) { uint8_t *url_rec = calloc(LDB_MAX_REC_LN, 1); /*Alloc memory for url records */ - + ldb_fetch_recordset(NULL, oss_url, url_id, false, get_oldest_url, (void *)url_rec); - /* Extract date from url_rec */ - char date[MAX_ARGLN] = "0"; - extract_csv(date, (char *)url_rec, 4, MAX_ARGLN); - /* Create a new component and fill it from the url record */ - component_data_t *new_comp = calloc(1, sizeof(*new_comp)); - bool result = fill_component(new_comp, url_id, path, (uint8_t *)url_rec); - if (result) - { - new_comp->file_md5_ref = component_list->match_ref->file_md5; - /* If the component is valid add it to the component list */ - /* The component list is a fixed size list, of size 3 by default, this means the list will keep the free oldest components*/ - /* The oldest component will be the first in the list, if two components have the same age the purl date will untie */ - new_comp->identified = IDENTIFIED_NONE; - asset_declared(new_comp); - if (!component_list_add(component_list, new_comp, component_hint_date_comparation, true)) - { - scanlog("component rejected by date: %s\n",new_comp->purls[0]); - component_data_free(new_comp); /* Free if the componet was rejected */ - } - } - else + /* Extract date from url_rec */ + char date[MAX_ARGLN] = "0"; + extract_csv(date, (char *)url_rec, 4, MAX_ARGLN); + /* Create a new component and fill it from the url record */ + component_data_t *new_comp = calloc(1, sizeof(*new_comp)); + bool result = fill_component(new_comp, url_id, path, (uint8_t *)url_rec); + if (result) + { + new_comp->file_md5_ref = component_list->match_ref->file_md5; + /* If the component is valid add it to the component list */ + /* The component list is a fixed size list, of size 3 by default, this means the list will keep the free oldest components*/ + /* The oldest component will be the first in the list, if two components have the same age the purl date will untie */ + new_comp->identified = IDENTIFIED_NONE; + asset_declared(new_comp); + new_comp->file_path_ref = component_list->match_ref->scan_ower->file_path; + new_comp->path_rank = PATH_LEVEL_COMP_INIT_VALUE; + scanlog("--- new comp ---\n"); + if (!component_list_add(component_list, new_comp, component_hint_date_comparation, true)) { - scanlog("incomplete component: %s\n", new_comp->component); - component_data_free(new_comp); + scanlog("component rejected: %s\n", new_comp->purls[0]); + component_data_free(new_comp); /* Free if the componet was rejected */ } - free(url_rec); + else + scanlog("component accepted: %s - pathrank: %d\n", new_comp->purls[0], new_comp->path_rank); + + } + else + { + scanlog("incomplete component: %s\n", new_comp->component); + component_data_free(new_comp); + } + free(url_rec); return true; } @@ -284,12 +433,11 @@ bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * memcpy(url_id, raw_data, MD5_LEN); char path[MAX_FILE_PATH+1]; strncpy(path, decrypted, MAX_FILE_PATH); - if (!ignored_extension(path)) + //check the ignore list only if the match type is MATCH_SNIPPET. TODO: remove this after remine everything. + if (!(component_list->match_ref->type == MATCH_SNIPPET && ignored_extension(path))) add_component_from_urlid(component_list, url_id, path); free(decrypted); - - //scanlog("#%d File %s\n", iteration, files[iteration].path); return false; } @@ -365,10 +513,11 @@ bool load_matches(match_data_t *match) if (!item->entries.le_next || !item->entries.le_next->component) break; /* if the date of two components it's the same */ - if(!strcmp(item->component->release_date, item->entries.le_next->component->release_date)) + if((!strcmp(item->component->release_date, item->entries.le_next->component->release_date) && + item->component->identified <= item->entries.le_next->component->identified)) { /* If item has no dependencies or depencencies are empty I must check the next one */ - if(!item->component->dependency_text || strlen(item->component->dependency_text) < 4) + if(!item->component->dependency_text || strlen(item->component->dependency_text) < 4) { /* if item has dependencies, stop */ if(print_dependencies(item->component)) @@ -376,6 +525,7 @@ bool load_matches(match_data_t *match) /*if the next component has dependencies, permute */ else if (print_dependencies(item->entries.le_next->component)) { + scanlog("Component permuted due to dependency tiebreak\n"); struct comp_entry *aux = item->entries.le_next->entries.le_next; LIST_INSERT_HEAD(&match->component_list.headp, item->entries.le_next, entries); item->entries.le_next = aux; @@ -580,7 +730,7 @@ void match_select_best(scan_data_t *scan) if (!scan->best_match || !scan->best_match->component_list.items || ((engine_flags & DISABLE_REPORT_IDENTIFIED) && scan->best_match->component_list.headp.lh_first->component->identified)) { scan->match_type = MATCH_NONE; - scanlog("Match without components or declared in sbom"); + scanlog("Match without components or declared in sbom\n"); } } diff --git a/src/match_list.c b/src/match_list.c index e39f52d..928c74c 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -80,7 +80,7 @@ bool component_list_add(component_list_t *list, component_data_t *new_comp, bool if (!list->headp.lh_first) { - scanlog("first component in list\n"); + scanlog("first component in list %s\n", new_comp->purls[0]); struct comp_entry *nn = calloc(1, sizeof(struct comp_entry)); /* Insert at the head. */ LIST_INSERT_HEAD(&list->headp, nn, entries); nn->component = new_comp; diff --git a/src/mz.c b/src/mz.c index 3c82cef..444e090 100644 --- a/src/mz.c +++ b/src/mz.c @@ -38,71 +38,6 @@ #include "debug.h" #include - -#ifndef MZ_DEFLATE -#include -/* This code is here to provide backward compatibility, this is duplicated in the newers versions of ldb*/ -#define CHUNK_SIZE 1024 - -int uncompress_by_chunks(uint8_t **data, uint8_t *zdata, size_t zdata_len) { - int ret; - z_stream strm; - unsigned char out[CHUNK_SIZE]; - size_t data_size = 0; // Current size of decompressed data - - // Initialize the z_stream structure - memset(&strm, 0, sizeof(strm)); - ret = inflateInit(&strm); - if (ret != Z_OK) { - fprintf(stderr, "inflateInit failed with error %d\n", ret); - exit(EXIT_FAILURE); - } - *data = malloc(CHUNK_SIZE); - // Process the compressed data - strm.avail_in = zdata_len; // Size of the compressed data - strm.next_in = zdata; - - do { - strm.avail_out = CHUNK_SIZE; - strm.next_out = out; - - ret = inflate(&strm, Z_NO_FLUSH); - if (ret == Z_STREAM_ERROR) { - fprintf(stderr, "inflate failed with error Z_STREAM_ERROR\n"); - inflateEnd(&strm); - mz_corrupted(); - } - - unsigned have = CHUNK_SIZE - strm.avail_out; - - // Realloc to increase the size of data - *data = realloc(*data, data_size + have); - if (*data == NULL) - { - fprintf(stderr, "Error reallocating memory to store decompressed data"); - inflateEnd(&strm); - exit(EXIT_FAILURE); - } - - // Copy the decompressed data to the end of data - memcpy(*data + data_size, out, have); - data_size += have; - } while (ret != Z_STREAM_END); - - // Free resources - inflateEnd(&strm); - return data_size; -} - -void mz_deflate2(struct mz_job *job) -{ - /* Decompress data */ - job->data_ln = uncompress_by_chunks((uint8_t **) &job->data, job->zdata, job->zdata_ln); - job->data_ln--; -} -#define MZ_DEFLATE(job) mz_deflate2(job) -#endif - /** * @brief Find a key and print the result * diff --git a/src/report.c b/src/report.c index 8d724b4..85c5939 100644 --- a/src/report.c +++ b/src/report.c @@ -193,22 +193,6 @@ void print_purl_array(component_data_t * component) printf("],"); } -/** - * @brief Skip the first directory name for Github and Gitlab files - * @param purl purl string - * @param file file string - * @return modified file string - */ -char *file_skip_release(char *purl, char *file) -{ - if (!(engine_flags & ENABLE_GITHUB_FULL_PATH) && (starts_with(purl, "pkg:github") || starts_with(purl, "pkg:gitlab"))) - { - return skip_first_slash(file); - } - return file; -} - - bool print_json_component(component_data_t * component) { if (!component) @@ -229,7 +213,7 @@ bool print_json_component(component_data_t * component) printf("{"); else printf(","); -/* Fetch related purls */ + /* Fetch related purls */ fetch_related_purls(component); /* Calculate main URL */ @@ -257,7 +241,9 @@ bool print_json_component(component_data_t * component) printf("\"download_url\": \"%s\",", component->url); printf("\"release_date\": \"%s\",", component->release_date); - printf("\"file\": \"%s\",", component->url_match == true ? basename(component->url) : file_skip_release(component->purls[0], component->file)); + printf("\"file\": \"%s\",", component->url_match == true ? basename(component->url) : component->file); + if (engine_flags & ENABLE_PATH_HINT) + printf("\"path_rank\": %d,", component->path_rank); char *url_id = md5_hex(component->url_md5); printf("\"url_hash\": \"%s\"", url_id); diff --git a/src/scan.c b/src/scan.c index 0a4ac23..a9ac93e 100644 --- a/src/scan.c +++ b/src/scan.c @@ -149,7 +149,7 @@ int asset_declared(component_data_t * comp) break; /* Compare purl */ - if (comp->purls[0]) + if (purl && comp->purls[0]) { scanlog("check assets with %s\n", purl); if (!strcmp((const char *) purl, (const char *) comp->purls[0])) @@ -408,19 +408,13 @@ void output_matches_json(scan_data_t *scan) * * @param scan */ -void ldb_scan(scan_data_t * scan) +void ldb_scan(scan_data_t *scan) { - bool skip = false; if (!scan) return; - if (unwanted_path(scan->file_path)) - { - skip = true; - scanlog("File %s skipped by path", scan->file_path); - } /* LDB must be available to proceed with the scan*/ - if (!ldb_table_exists(oss_file.db, oss_file.table) || !ldb_table_exists(oss_url.db, oss_url.table)) + if (!ldb_table_exists(oss_file.db, oss_file.table) || !ldb_table_exists(oss_url.db, oss_url.table)) { printf("Error: file and url tables must be present in %s KB in order to proceed with the scan\n", oss_file.db); free(scan); @@ -433,74 +427,66 @@ void ldb_scan(scan_data_t * scan) /* Get file length */ uint64_t file_size = 0; - if (!skip) - { - if (scan->preload) file_size = atoi(scan->file_size); - else file_size = get_file_size(scan->file_path); - if (file_size < 0) ldb_error("Cannot access file"); - } + + if (scan->preload) + file_size = atoi(scan->file_size); + else + file_size = get_file_size(scan->file_path); + + if (file_size < 0) + ldb_error("Cannot access file"); /* Calculate MD5 hash (if not already preloaded) */ - if (!skip) if (!scan->preload) get_file_md5(scan->file_path, scan->md5); + if (!scan->preload) + get_file_md5(scan->file_path, scan->md5); - if (!skip && extension(scan->file_path) && ignored_extension(scan->file_path)) - { - skip = true; - scanlog("File %s skipped by extension", scan->file_path); - } + /* Scan full file */ + char *tmp_md5_hex = md5_hex(scan->md5); + strcpy(scan->source_md5, tmp_md5_hex); + free(tmp_md5_hex); - /* Ignore <=1 byte */ - if (file_size <= MIN_FILE_SIZE) - { - skip = true; - scanlog("File %s skipped by file size < %d\n", scan->file_path, MIN_FILE_SIZE); - } + /* Look for full file match or url match in ldb */ + scan->match_type = ldb_scan_file(scan); - if (!skip) + /* If no match, scan snippets */ + if (scan->match_type == MATCH_NONE || force_snippet_scan) { - /* Scan full file */ - char *tmp_md5_hex = md5_hex(scan->md5); - strcpy(scan->source_md5, tmp_md5_hex); - free(tmp_md5_hex); - - /* Look for full file match or url match in ldb */ - scan->match_type = ldb_scan_file(scan); - - /* If no match, scan snippets */ - if (scan->match_type == MATCH_NONE || force_snippet_scan) + /* Load snippets into scan data */ + if (!scan->preload) { - /* Load snippets into scan data */ - if (!scan->preload) + /* Read file into memory */ + char *src = calloc(MAX_FILE_SIZE, 1); + if (file_size < MAX_FILE_SIZE) + read_file(src, scan->file_path, 0); + + /* If HPSM is enable calculate the crc8 line hash calling the shared lib */ + if (hpsm_enabled) { - /* Read file into memory */ - char *src = calloc(MAX_FILE_SIZE, 1); - if (file_size < MAX_FILE_SIZE) read_file(src, scan->file_path, 0); - - /* If HPSM is enable calculate the crc8 line hash calling the shared lib */ - if(hpsm_enabled) + char *aux = hpsm_hash_file_contents(src); + if (aux) { - char *aux = hpsm_hash_file_contents(src); - if(aux) - { - hpsm_crc_lines = strdup(&aux[5]); - free(aux); - } - } - /* Determine if file is to skip snippet search */ - if (!skip_snippets(src, file_size)) - { /* Load wfps into scan structure */ - scan->hash_count = winnowing(src, scan->hashes, scan->lines, MAX_FILE_SIZE); - if (scan->hash_count) scan->total_lines = scan->lines[scan->hash_count - 1]; + hpsm_crc_lines = strdup(&aux[5]); + free(aux); } - free(src); } - else if (scan->hash_count) scan->total_lines = scan->lines[scan->hash_count - 1]; + /* Determine if file is to skip snippet search */ + if (!skip_snippets(src, file_size)) + { /* Load wfps into scan structure */ + scan->hash_count = winnowing(src, scan->hashes, scan->lines, MAX_FILE_SIZE); + if (scan->hash_count) + scan->total_lines = scan->lines[scan->hash_count - 1]; + } + free(src); + } + else if (scan->hash_count) + scan->total_lines = scan->lines[scan->hash_count - 1]; - /* Perform snippet scan */ - if (scan->total_lines) scan->match_type = ldb_scan_snippets(scan); + /* Perform snippet scan */ + if (scan->total_lines) + scan->match_type = ldb_scan_snippets(scan); - else scanlog("File skipped\n"); - } + else + scanlog("File skipped\n"); } /* Compile matches */ @@ -508,7 +494,7 @@ void ldb_scan(scan_data_t * scan) if (!scan->best_match) scanlog("No best match\n"); - + /* Output matches */ scanlog("Match output starts\n"); if (!quiet) diff --git a/src/snippets.c b/src/snippets.c index ecbf7ec..e0e4e41 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -191,14 +191,19 @@ static bool get_all_file_ids(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8 { uint8_t *record = (uint8_t *)ptr; + if (data == NULL && datalen > 0) + { + scanlog("Error quering WFP table. datalen=%u but data is NULL\n", datalen); + uint32_write(record,0); + return true; + } + if (datalen) { uint32_t size = uint32_read(record); /* End recordset fetch if MAX_QUERY_RESPONSE is reached */ if (size + datalen + 4 >= WFP_REC_LN * MATCHMAP_ITEM_SIZE) { - //memcpy(record + size + 4, data, WFP_REC_LN * MATCHMAP_ITEM_SIZE - size); - //uint32_write(record, size + WFP_REC_LN * MATCHMAP_ITEM_SIZE); return true; } @@ -598,7 +603,6 @@ typedef struct matchmap_entry_t uint8_t wfp[WFP_LN]; uint32_t size; } matchmap_entry_t; - /** * @brief Add one new md5 to the matchmap * @param scan pointer to scan object @@ -742,6 +746,9 @@ match_t ldb_scan_snippets(scan_data_t *scan) /* First build a map with all the MD5s related with each WFP from the source file*/ matchmap_entry_t map[scan->hash_count]; + int8_t map_lines_indirection[scan->lines[scan->hash_count -1] + 1]; + memset(map_lines_indirection, -1, sizeof(map_lines_indirection)); + int lines_coverage = 0; int map_max_size = 0; for (long i = 0; i < scan->hash_count; i++) { @@ -751,13 +758,13 @@ match_t ldb_scan_snippets(scan_data_t *scan) //scanlog(" Add wfp %02x%02x%02x%02x to map\n",map[i].wfp[0], map[i].wfp[1],map[i].wfp[2],map[i].wfp[3]); uint32_write(map[i].md5_set, 0); map[i].line = scan->lines[i]; + map_lines_indirection[scan->lines[i]] = 0; ldb_fetch_recordset(NULL, oss_wfp, map[i].wfp, false, get_all_file_ids, (void *)map[i].md5_set); map[i].size = uint32_read(map[i].md5_set) / WFP_REC_LN; if (map[i].size > map_max_size) map_max_size = map[i].size; } - /* Classify the WFPs in cathegories depending on popularity Each cathegoy will contain a sub set of index refered to map rows*/ #define MAP_INDIRECTION_CAT_NUMBER 1000 @@ -789,6 +796,12 @@ match_t ldb_scan_snippets(scan_data_t *scan) map_indirection_index[cat]++; } + if (map_max_size <= 0) + { + scanlog("Warning no WFP with hits, returning failed\n"); + return MATCH_NONE; + } + /* Calculate a limit to the quantity of cathegories to be processed, the cathegoies with less quantity of MD5s (less popular) will be prioritased*/ int cat_limit = 0; @@ -803,9 +816,14 @@ match_t ldb_scan_snippets(scan_data_t *scan) continue; hashes_to_process++; cat_limit += map[map_indirection[i][j]].size; + if (map_lines_indirection[map[map_indirection[i][j]].line] == 0) + { + map_lines_indirection[map[map_indirection[i][j]].line] = 1; + lines_coverage++; + } if (cat_limit > matchmap_max_files) { - if (hashes_to_process < scan->hash_count / 10 && cat_limit < MAX_MATCHMAP_FILES) + if ((hashes_to_process < scan->hash_count / 10 || (float) lines_coverage / scan->hash_count < 0.6) && cat_limit < MAX_MATCHMAP_FILES) { matchmap_max_files += map[map_indirection[i][j]].size; } @@ -830,14 +848,23 @@ match_t ldb_scan_snippets(scan_data_t *scan) { for (int j=0; j < map_indirection_index[i]; j++) { - uint8_t * wfp = map[map_indirection[i][j]].wfp; + uint8_t * wfp = map[map_indirection[i][j]].wfp; scanlog("Cat :%d.%d - line %d - %02x%02x%02x%02x - size %d\n",i,j, map[map_indirection[i][j]].line, wfp[0], wfp[1],wfp[2],wfp[3], map[map_indirection[i][j]].size); } } + + for (int i = 0; i <= scan->lines[scan->hash_count - 1]; i++) + { + if (map_lines_indirection[i] > -1 && map_lines_indirection[i] == 0) + { + scanlog("Warning ignored line %d\n", i); + } + } } matchmap_max_files = cat_limit; - scanlog("Map limit on %d MD5s at %d of %d. Selected hashed: %d/%d - cat_limit_files = %d\n",matchmap_max_files, cat_limit_index, MAP_INDIRECTION_CAT_NUMBER, hashes_to_process, scan->hash_count, cat_limit); + scanlog("Map limit on %d MD5s at %d of %d lines. Selected hashed: %d/%d - cat_limit_files = %d - lines coverage %d\n", + matchmap_max_files, cat_limit_index, MAP_INDIRECTION_CAT_NUMBER, hashes_to_process, scan->hash_count, cat_limit, (lines_coverage * 100) / scan->hash_count); scan->matchmap = calloc(matchmap_max_files, sizeof(matchmap_entry)); int map_indexes[scan->hash_count]; @@ -951,7 +978,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) } //Free memory - for (int i = 0; i < scan->hash_count; i++) + for (int i = 0; i < scan->hash_count; i++) { free(map[i].md5_set); }