Skip to content

Commit

Permalink
Do not read content files within scancode adapter if they exceed a gi…
Browse files Browse the repository at this point in the history
…ven size (1 Mio bytes) (#278)
  • Loading branch information
ohecker authored Jul 25, 2024
1 parent e2a683c commit fdba053
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ public enum LogMessages {
"When reading yarn license info from file '{}' there was at least one patched package encountered. Processing only the base package, not the patched version."), //
FAILED_READING_FILE(71, "Reading file '{}' failed"), //
EMPTY_PACKAGE_URL(72, "The package URL is null or empty."), //
EMPTY_PACKAGE_PATH(73, "The package path is null or empty.");
EMPTY_PACKAGE_PATH(73, "The package path is null or empty."), //
CONTENT_FILE_TOO_LARGE(74,
"The size of the content file '{}' is '{}' (max. allowed is '{}'). Reading will be skipped.");

private final String message;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public class FileScancodeRawComponentInfoProvider implements ScancodeRawComponen

private String repoBasePath;

private long maxContentFileSize = 1000000L; // set this to the default even if spring is not used

private AllKindsPackageURLHandler packageURLHandler;

/**
Expand All @@ -63,6 +65,17 @@ public void setRepoBasePath(String repoBasePath) {
this.repoBasePath = repoBasePath;
}

/**
* Sets the maximum content file size. Files larger that this will not be processed to avoid OOM.
*
* @param maxContentFileSize new limit.
*/
@Value("${solicitor.scancode.max-content-file-size:1000000}")
public void setMaxContentFileSize(long maxContentFileSize) {

this.maxContentFileSize = maxContentFileSize;
}

/**
* Retrieve the {@link ScancodeRawComponentInfo} for the package given by its PackageURL.
*
Expand Down Expand Up @@ -191,6 +204,11 @@ public String retrieveContent(String packageUrl, String fileUri) {
String fullFilePathAndName = IOHelper.secureFilePath(this.repoBasePath, this.packageURLHandler.pathFor(packageUrl),
SOURCES_DIR, relativeFilePathAndName);
File file = new File(fullFilePathAndName);
long fileSize = file.length();
if (fileSize > this.maxContentFileSize) {
LOG.info(LogMessages.CONTENT_FILE_TOO_LARGE.msg(), fullFilePathAndName, fileSize, this.maxContentFileSize);
return null;
}
try (InputStream is = new FileInputStream(file); Scanner s = new Scanner(is)) {
s.useDelimiter("\\A");
String result = s.hasNext() ? s.next() : "";
Expand Down
4 changes: 4 additions & 0 deletions core/src/main/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ solicitor.scancode.curations-filename=output/curations.yaml
solicitor.scancode.repo-base-path=output/Source
# list of patterns of found licenses which shall set the dataStatus to WITH_ISSUES
solicitor.scancode.issuelistpatterns=.*unknown.*
# the maximum file size which is processed when retrieving license texts and notice file content
# from the source files of the package. This limit prevents huge memory consumption which might cause possible
# stability problems.
#solicitor.scancode.solicitor.scancode.max-content-file-size=1000000

# The curationDataSelector value to use when accessing curation data.
# You can change its value to select a specific curation data source (if the implementation supports this).
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package com.devonfw.tools.solicitor.componentinfo.scancode;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import com.devonfw.tools.solicitor.common.packageurl.AllKindsPackageURLHandler;

/**
* This class contains JUnit test methods for the {@link FileScancodeRawComponentInfoProvider} class.
*/
public class FileScancodeRawComponentInfoProviderTests {

// the object under test
FileScancodeRawComponentInfoProvider fileScancodeRawComponentInfoProvider;

@BeforeEach
public void setup() {

AllKindsPackageURLHandler packageURLHandler = Mockito.mock(AllKindsPackageURLHandler.class);

Mockito.when(packageURLHandler.pathFor("pkg:maven/com.devonfw.tools/[email protected]"))
.thenReturn("pkg/maven/com/devonfw/tools/test-project-for-deep-license-scan/0.1.0");

this.fileScancodeRawComponentInfoProvider = new FileScancodeRawComponentInfoProvider(packageURLHandler);
this.fileScancodeRawComponentInfoProvider.setRepoBasePath("src/test/resources/scancodefileadapter/Source/repo");

}

/**
* Test the {@link FileScancodeRawComponentInfoProvider#retrieveContent(String, String)} method for a file which does
* not exceed the given default size limit (which should be 1 Mio Bytes).
*/
@Test
public void testRetrieveContentFileSizeOkForDefault() {

// when
String fileContent = this.fileScancodeRawComponentInfoProvider.retrieveContent(
"pkg:maven/com.devonfw.tools/[email protected]", "pkgcontent:/NOTICE.txt");

// then
assertEquals("This is a dummy notice file for testing. Code is under Apache-2.0.", fileContent);
}

/**
* Test the {@link FileScancodeRawComponentInfoProvider#retrieveContent(String, String)} method for a file which does
* not exceed the given size limit.
*/
@Test
public void testRetrieveContentFileSizeOk() {

// given
this.fileScancodeRawComponentInfoProvider.setMaxContentFileSize(66);

// when
String fileContent = this.fileScancodeRawComponentInfoProvider.retrieveContent(
"pkg:maven/com.devonfw.tools/[email protected]", "pkgcontent:/NOTICE.txt");

// then
assertEquals("This is a dummy notice file for testing. Code is under Apache-2.0.", fileContent);
}

/**
* Test the {@link FileScancodeRawComponentInfoProvider#retrieveContent(String, String)} method for a file which does
* exceed the given size limit.
*/
@Test
public void testRetrieveContentFileSizeNotOk() {

// given
this.fileScancodeRawComponentInfoProvider.setMaxContentFileSize(65);

// when
String fileContent = this.fileScancodeRawComponentInfoProvider.retrieveContent(
"pkg:maven/com.devonfw.tools/[email protected]", "pkgcontent:/NOTICE.txt");

// then
assertNull(fileContent);
}

}
4 changes: 4 additions & 0 deletions documentation/files/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ solicitor.scancode.curations-filename=output/curations.yaml
solicitor.scancode.repo-base-path=output/Source
# list of patterns of found licenses which shall set the dataStatus to WITH_ISSUES
solicitor.scancode.issuelistpatterns=.*unknown.*
# the maximum file size which is processed when retrieving license texts and notice file content
# from the source files of the package. This limit prevents huge memory consumption which might cause possible
# stability problems.
#solicitor.scancode.solicitor.scancode.max-content-file-size=1000000

# The curationDataSelector value to use when accessing curation data.
# You can change its value to select a specific curation data source (if the implementation supports this).
Expand Down
19 changes: 10 additions & 9 deletions documentation/master-solicitor.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -1307,7 +1307,7 @@ The Generic Excel Writer exists purely for debugging purposes. This writer write
"APPLICATIONCOMPONENT" : "classpath:com/devonfw/tools/solicitor/sql/allden_applicationcomponents.sql",
"LICENSE" : "classpath:com/devonfw/tools/solicitor/sql/allden_normalizedlicenses.sql",
"OSSLICENSES" : "classpath:com/devonfw/tools/solicitor/sql/ossapplicationcomponents.sql",
...
...
}
} ]

Expand Down Expand Up @@ -1574,7 +1574,7 @@ artifacts:
- name: pkg/npm/@somescope/somepackage/1.2.3 <1>
url: https://github.com/foo/bar <2>
licenseCurations: <3>
- operation: REMOVE
- operation: REMOVE
path: "sources/package/readme.md"
ruleIdentifier: "proprietary-license_unknown_13.RULE"
matchedText: ".* to be paid .*"
Expand Down Expand Up @@ -1603,8 +1603,8 @@ artifacts:
<7> Further packages to follow.

===== Rules for curating licenses
Curating licenses is done by REMOVING (i.e. ignoring) specific license findings from ScanCode, by REPLACING the detected license with another one or by ADDING license findings either to specific files or on top level (not related to specific file of the package sources).
In addition to the conditions/data which is specific for any of the below described operations it is always possible to define a comment which is intended to be included in any audit trail log for documentation purposes (not yet used/implemented).
Curating licenses is done by REMOVING (i.e. ignoring) specific license findings from ScanCode, by REPLACING the detected license with another one or by ADDING license findings either to specific files or on top level (not related to specific file of the package sources).
In addition to the conditions/data which is specific for any of the below described operations it is always possible to define a comment which is intended to be included in any audit trail log for documentation purposes (not yet used/implemented).


====== Licenses: REMOVE
Expand All @@ -1624,7 +1624,7 @@ Instead of removing licenses (ignoring the finding) they might be replaced with
Data:

* `newLicense` is the key / id of the license to use instead (replacing `files[].licenses[].spdx_license_key`)
* `url` is the url pointing to the license text
* `url` is the url pointing to the license text

At least one of the two parameters has to be set.

Expand Down Expand Up @@ -1658,14 +1658,14 @@ At least one of the conditions has to be defined.
This follows the above principles. It uses the same conditions as REMOVE and uses a parameter to define the copyright to use instead:

Data:
* `newCopyright`: The copyright entry to use instead of the originally found copyright
* `newCopyright`: The copyright entry to use instead of the originally found copyright

====== Copyright: ADD
Adding new copyrights is done by defining rules which add new copyright info (to the copyrights found in a source file) - or "on top level".

Conditions:

* `path` of the file within the sources (defined as a regular expression; if omitted the copyright will be applied on "top level").
* `path` of the file within the sources (defined as a regular expression; if omitted the copyright will be applied on "top level").
Note that it is again only possible to add copyrigts to paths which are listed in the scancode json

Data:
Expand Down Expand Up @@ -1712,9 +1712,9 @@ artifacts:

===== Hierarchical definition of rules

Different version of a package/component or even different packages/components within the same namespace often require mostly the same curations to be applied. To avoid being forced to redefine curations for every single version it is possible to define curations by just specifying a prefix part in the name attribute.
Different version of a package/component or even different packages/components within the same namespace often require mostly the same curations to be applied. To avoid being forced to redefine curations for every single version it is possible to define curations by just specifying a prefix part in the name attribute.

Example of available levels/prefixes for `pkg:/maven/ch.qos.logback/[email protected]`
Example of available levels/prefixes for `pkg:/maven/ch.qos.logback/[email protected]`

* `pkg`
* `pkg/maven`
Expand Down Expand Up @@ -1896,6 +1896,7 @@ Spring beans implementing this interface will be called at certain points in the
[appendix]
== Release Notes
Changes in 1.25.0::
* https://github.com/devonfw/solicitor/issues/277: When reading content (license texts or notice files) within the scancode adapter files which are greater than 1 million bytes will be skipped. This avoids large memory consumption and resulting instability.

Changes in 1.24.2::
* https://github.com/devonfw/solicitor/pull/271: Fixed an incompatibility with JDK 8.
Expand Down

0 comments on commit fdba053

Please sign in to comment.