From fdba053456825b184c3243dfa982fb4ce581e274 Mon Sep 17 00:00:00 2001 From: Oliver Hecker <8004361+ohecker@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:16:49 +0200 Subject: [PATCH] Do not read content files within scancode adapter if they exceed a given size (1 Mio bytes) (#278) --- .../tools/solicitor/common/LogMessages.java | 4 +- .../FileScancodeRawComponentInfoProvider.java | 18 ++++ .../src/main/resources/application.properties | 4 + ...ScancodeRawComponentInfoProviderTests.java | 84 +++++++++++++++++++ documentation/files/application.properties | 4 + documentation/master-solicitor.asciidoc | 19 +++-- 6 files changed, 123 insertions(+), 10 deletions(-) create mode 100644 core/src/test/java/com/devonfw/tools/solicitor/componentinfo/scancode/FileScancodeRawComponentInfoProviderTests.java diff --git a/core/src/main/java/com/devonfw/tools/solicitor/common/LogMessages.java b/core/src/main/java/com/devonfw/tools/solicitor/common/LogMessages.java index 0b4afff1..6cd91c39 100644 --- a/core/src/main/java/com/devonfw/tools/solicitor/common/LogMessages.java +++ b/core/src/main/java/com/devonfw/tools/solicitor/common/LogMessages.java @@ -106,7 +106,9 @@ public enum LogMessages { "When reading yarn license info from file '{}' there was at least one patched package encountered. Processing only the base package, not the patched version."), // FAILED_READING_FILE(71, "Reading file '{}' failed"), // EMPTY_PACKAGE_URL(72, "The package URL is null or empty."), // - EMPTY_PACKAGE_PATH(73, "The package path is null or empty."); + EMPTY_PACKAGE_PATH(73, "The package path is null or empty."), // + CONTENT_FILE_TOO_LARGE(74, + "The size of the content file '{}' is '{}' (max. allowed is '{}'). Reading will be skipped."); private final String message; diff --git a/core/src/main/java/com/devonfw/tools/solicitor/componentinfo/scancode/FileScancodeRawComponentInfoProvider.java b/core/src/main/java/com/devonfw/tools/solicitor/componentinfo/scancode/FileScancodeRawComponentInfoProvider.java index c4397c60..dba3f579 100644 --- a/core/src/main/java/com/devonfw/tools/solicitor/componentinfo/scancode/FileScancodeRawComponentInfoProvider.java +++ b/core/src/main/java/com/devonfw/tools/solicitor/componentinfo/scancode/FileScancodeRawComponentInfoProvider.java @@ -38,6 +38,8 @@ public class FileScancodeRawComponentInfoProvider implements ScancodeRawComponen private String repoBasePath; + private long maxContentFileSize = 1000000L; // set this to the default even if spring is not used + private AllKindsPackageURLHandler packageURLHandler; /** @@ -63,6 +65,17 @@ public void setRepoBasePath(String repoBasePath) { this.repoBasePath = repoBasePath; } + /** + * Sets the maximum content file size. Files larger that this will not be processed to avoid OOM. + * + * @param maxContentFileSize new limit. + */ + @Value("${solicitor.scancode.max-content-file-size:1000000}") + public void setMaxContentFileSize(long maxContentFileSize) { + + this.maxContentFileSize = maxContentFileSize; + } + /** * Retrieve the {@link ScancodeRawComponentInfo} for the package given by its PackageURL. * @@ -191,6 +204,11 @@ public String retrieveContent(String packageUrl, String fileUri) { String fullFilePathAndName = IOHelper.secureFilePath(this.repoBasePath, this.packageURLHandler.pathFor(packageUrl), SOURCES_DIR, relativeFilePathAndName); File file = new File(fullFilePathAndName); + long fileSize = file.length(); + if (fileSize > this.maxContentFileSize) { + LOG.info(LogMessages.CONTENT_FILE_TOO_LARGE.msg(), fullFilePathAndName, fileSize, this.maxContentFileSize); + return null; + } try (InputStream is = new FileInputStream(file); Scanner s = new Scanner(is)) { s.useDelimiter("\\A"); String result = s.hasNext() ? s.next() : ""; diff --git a/core/src/main/resources/application.properties b/core/src/main/resources/application.properties index 901ec800..ffd53a77 100644 --- a/core/src/main/resources/application.properties +++ b/core/src/main/resources/application.properties @@ -64,6 +64,10 @@ solicitor.scancode.curations-filename=output/curations.yaml solicitor.scancode.repo-base-path=output/Source # list of patterns of found licenses which shall set the dataStatus to WITH_ISSUES solicitor.scancode.issuelistpatterns=.*unknown.* +# the maximum file size which is processed when retrieving license texts and notice file content +# from the source files of the package. This limit prevents huge memory consumption which might cause possible +# stability problems. +#solicitor.scancode.solicitor.scancode.max-content-file-size=1000000 # The curationDataSelector value to use when accessing curation data. # You can change its value to select a specific curation data source (if the implementation supports this). diff --git a/core/src/test/java/com/devonfw/tools/solicitor/componentinfo/scancode/FileScancodeRawComponentInfoProviderTests.java b/core/src/test/java/com/devonfw/tools/solicitor/componentinfo/scancode/FileScancodeRawComponentInfoProviderTests.java new file mode 100644 index 00000000..a8edbb18 --- /dev/null +++ b/core/src/test/java/com/devonfw/tools/solicitor/componentinfo/scancode/FileScancodeRawComponentInfoProviderTests.java @@ -0,0 +1,84 @@ +package com.devonfw.tools.solicitor.componentinfo.scancode; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import com.devonfw.tools.solicitor.common.packageurl.AllKindsPackageURLHandler; + +/** + * This class contains JUnit test methods for the {@link FileScancodeRawComponentInfoProvider} class. + */ +public class FileScancodeRawComponentInfoProviderTests { + + // the object under test + FileScancodeRawComponentInfoProvider fileScancodeRawComponentInfoProvider; + + @BeforeEach + public void setup() { + + AllKindsPackageURLHandler packageURLHandler = Mockito.mock(AllKindsPackageURLHandler.class); + + Mockito.when(packageURLHandler.pathFor("pkg:maven/com.devonfw.tools/test-project-for-deep-license-scan@0.1.0")) + .thenReturn("pkg/maven/com/devonfw/tools/test-project-for-deep-license-scan/0.1.0"); + + this.fileScancodeRawComponentInfoProvider = new FileScancodeRawComponentInfoProvider(packageURLHandler); + this.fileScancodeRawComponentInfoProvider.setRepoBasePath("src/test/resources/scancodefileadapter/Source/repo"); + + } + + /** + * Test the {@link FileScancodeRawComponentInfoProvider#retrieveContent(String, String)} method for a file which does + * not exceed the given default size limit (which should be 1 Mio Bytes). + */ + @Test + public void testRetrieveContentFileSizeOkForDefault() { + + // when + String fileContent = this.fileScancodeRawComponentInfoProvider.retrieveContent( + "pkg:maven/com.devonfw.tools/test-project-for-deep-license-scan@0.1.0", "pkgcontent:/NOTICE.txt"); + + // then + assertEquals("This is a dummy notice file for testing. Code is under Apache-2.0.", fileContent); + } + + /** + * Test the {@link FileScancodeRawComponentInfoProvider#retrieveContent(String, String)} method for a file which does + * not exceed the given size limit. + */ + @Test + public void testRetrieveContentFileSizeOk() { + + // given + this.fileScancodeRawComponentInfoProvider.setMaxContentFileSize(66); + + // when + String fileContent = this.fileScancodeRawComponentInfoProvider.retrieveContent( + "pkg:maven/com.devonfw.tools/test-project-for-deep-license-scan@0.1.0", "pkgcontent:/NOTICE.txt"); + + // then + assertEquals("This is a dummy notice file for testing. Code is under Apache-2.0.", fileContent); + } + + /** + * Test the {@link FileScancodeRawComponentInfoProvider#retrieveContent(String, String)} method for a file which does + * exceed the given size limit. + */ + @Test + public void testRetrieveContentFileSizeNotOk() { + + // given + this.fileScancodeRawComponentInfoProvider.setMaxContentFileSize(65); + + // when + String fileContent = this.fileScancodeRawComponentInfoProvider.retrieveContent( + "pkg:maven/com.devonfw.tools/test-project-for-deep-license-scan@0.1.0", "pkgcontent:/NOTICE.txt"); + + // then + assertNull(fileContent); + } + +} diff --git a/documentation/files/application.properties b/documentation/files/application.properties index 901ec800..ffd53a77 100644 --- a/documentation/files/application.properties +++ b/documentation/files/application.properties @@ -64,6 +64,10 @@ solicitor.scancode.curations-filename=output/curations.yaml solicitor.scancode.repo-base-path=output/Source # list of patterns of found licenses which shall set the dataStatus to WITH_ISSUES solicitor.scancode.issuelistpatterns=.*unknown.* +# the maximum file size which is processed when retrieving license texts and notice file content +# from the source files of the package. This limit prevents huge memory consumption which might cause possible +# stability problems. +#solicitor.scancode.solicitor.scancode.max-content-file-size=1000000 # The curationDataSelector value to use when accessing curation data. # You can change its value to select a specific curation data source (if the implementation supports this). diff --git a/documentation/master-solicitor.asciidoc b/documentation/master-solicitor.asciidoc index 465b24d2..9abd2cfa 100644 --- a/documentation/master-solicitor.asciidoc +++ b/documentation/master-solicitor.asciidoc @@ -1307,7 +1307,7 @@ The Generic Excel Writer exists purely for debugging purposes. This writer write "APPLICATIONCOMPONENT" : "classpath:com/devonfw/tools/solicitor/sql/allden_applicationcomponents.sql", "LICENSE" : "classpath:com/devonfw/tools/solicitor/sql/allden_normalizedlicenses.sql", "OSSLICENSES" : "classpath:com/devonfw/tools/solicitor/sql/ossapplicationcomponents.sql", - ... + ... } } ] @@ -1574,7 +1574,7 @@ artifacts: - name: pkg/npm/@somescope/somepackage/1.2.3 <1> url: https://github.com/foo/bar <2> licenseCurations: <3> - - operation: REMOVE + - operation: REMOVE path: "sources/package/readme.md" ruleIdentifier: "proprietary-license_unknown_13.RULE" matchedText: ".* to be paid .*" @@ -1603,8 +1603,8 @@ artifacts: <7> Further packages to follow. ===== Rules for curating licenses -Curating licenses is done by REMOVING (i.e. ignoring) specific license findings from ScanCode, by REPLACING the detected license with another one or by ADDING license findings either to specific files or on top level (not related to specific file of the package sources). -In addition to the conditions/data which is specific for any of the below described operations it is always possible to define a comment which is intended to be included in any audit trail log for documentation purposes (not yet used/implemented). +Curating licenses is done by REMOVING (i.e. ignoring) specific license findings from ScanCode, by REPLACING the detected license with another one or by ADDING license findings either to specific files or on top level (not related to specific file of the package sources). +In addition to the conditions/data which is specific for any of the below described operations it is always possible to define a comment which is intended to be included in any audit trail log for documentation purposes (not yet used/implemented). ====== Licenses: REMOVE @@ -1624,7 +1624,7 @@ Instead of removing licenses (ignoring the finding) they might be replaced with Data: * `newLicense` is the key / id of the license to use instead (replacing `files[].licenses[].spdx_license_key`) -* `url` is the url pointing to the license text +* `url` is the url pointing to the license text At least one of the two parameters has to be set. @@ -1658,14 +1658,14 @@ At least one of the conditions has to be defined. This follows the above principles. It uses the same conditions as REMOVE and uses a parameter to define the copyright to use instead: Data: -* `newCopyright`: The copyright entry to use instead of the originally found copyright +* `newCopyright`: The copyright entry to use instead of the originally found copyright ====== Copyright: ADD Adding new copyrights is done by defining rules which add new copyright info (to the copyrights found in a source file) - or "on top level". Conditions: -* `path` of the file within the sources (defined as a regular expression; if omitted the copyright will be applied on "top level"). +* `path` of the file within the sources (defined as a regular expression; if omitted the copyright will be applied on "top level"). Note that it is again only possible to add copyrigts to paths which are listed in the scancode json Data: @@ -1712,9 +1712,9 @@ artifacts: ===== Hierarchical definition of rules -Different version of a package/component or even different packages/components within the same namespace often require mostly the same curations to be applied. To avoid being forced to redefine curations for every single version it is possible to define curations by just specifying a prefix part in the name attribute. +Different version of a package/component or even different packages/components within the same namespace often require mostly the same curations to be applied. To avoid being forced to redefine curations for every single version it is possible to define curations by just specifying a prefix part in the name attribute. -Example of available levels/prefixes for `pkg:/maven/ch.qos.logback/logback-classic@1.2.3` +Example of available levels/prefixes for `pkg:/maven/ch.qos.logback/logback-classic@1.2.3` * `pkg` * `pkg/maven` @@ -1896,6 +1896,7 @@ Spring beans implementing this interface will be called at certain points in the [appendix] == Release Notes Changes in 1.25.0:: +* https://github.com/devonfw/solicitor/issues/277: When reading content (license texts or notice files) within the scancode adapter files which are greater than 1 million bytes will be skipped. This avoids large memory consumption and resulting instability. Changes in 1.24.2:: * https://github.com/devonfw/solicitor/pull/271: Fixed an incompatibility with JDK 8.