diff --git a/src/lice_comb/impl/3rd_party.clj b/src/lice_comb/impl/3rd_party.clj deleted file mode 100644 index 813e106..0000000 --- a/src/lice_comb/impl/3rd_party.clj +++ /dev/null @@ -1,37 +0,0 @@ -;;;; lice_comb.impl.3rd_party.clj -;;; -;;; Code obtained from third party sources, but not readily available as a -;;; library via standard package-consumption mechanisms (i.e. a Maven artifact). -;;; -;;; Copyright and license information is on a per-code-snippet basis, and -;;; is communicated inline via further comments. -;;; -(ns lice-comb.impl.3rd-party) - -;; rdrop-while is copyright © Joshua Suskalo (https://github.com/IGJoshua) 2023 and licensed as "CC0-1.0 OR MIT" -;; -;; Source: https://discord.com/channels/729136623421227082/732641743723298877/1141786961875583097 -;; Link to request access: https://discord.gg/discljord -;; -;; Note that the lice-comb project elects to consume this code under the MIT license -(defn rdrop-while - "As for clojure.core/drop-while, but drops from the end of the - sequence backwards, rather than the front forwards. More efficient - when provided with a vector rather than a list." - ([pred coll] - (if (reversible? coll) - (take (- (count coll) (count (take-while pred (rseq coll)))) coll) - (reverse (drop-while pred (reverse coll))))) - ([pred] - (fn [rf] - (let [stash (volatile! [])] - (fn - ([] (rf)) - ([acc] (rf acc)) - ([acc elt] - (if (pred elt) - (do (vswap! stash conj elt) - acc) - (let [res (reduce rf acc (conj @stash elt))] - (vreset! stash []) - res)))))))) diff --git a/src/lice_comb/impl/data.clj b/src/lice_comb/impl/data.clj index cd490d9..1cd0f54 100644 --- a/src/lice_comb/impl/data.clj +++ b/src/lice_comb/impl/data.clj @@ -34,7 +34,7 @@ Use underscore ('_') instead. * Unlike during class loading, Clojure does not automatically switch hyphens in classpath resource path elements to underscores. This inconsistency can - be a time-wasting trap." + be a time-wasting foot gun." [path] (when-not (s/blank? path) (try diff --git a/src/lice_comb/impl/regex_matching.clj b/src/lice_comb/impl/id_detection.clj similarity index 97% rename from src/lice_comb/impl/regex_matching.clj rename to src/lice_comb/impl/id_detection.clj index b58aea6..c35071d 100644 --- a/src/lice_comb/impl/regex_matching.clj +++ b/src/lice_comb/impl/id_detection.clj @@ -16,9 +16,10 @@ ; SPDX-License-Identifier: Apache-2.0 ; -(ns lice-comb.impl.regex-matching - "Helper functionality focused on regex matching. Note: this namespace is not - part of the public API of lice-comb and may change without notice." +(ns lice-comb.impl.id-detection + "Helper functionality focused on detecting SPDX id(s) from a (short) string. + Note: this namespace is not part of the public API of lice-comb and may change + without notice." (:require [clojure.string :as s] [clojure.set :as set] [medley.core :as med] @@ -194,11 +195,11 @@ version (get-rencgs m ["version"] (if (= variant "LGPL") "2.0" "1.0")) version (s/replace version #"\p{Punct}+" ".") [confidence confidence-explanations] - (if (s/blank? version) - [:low #{:missing-version}] + (if version-present? (if (s/includes? version ".") [:high] - [:medium #{:partial-version}])) + [:medium #{:partial-version}]) + [:low #{:missing-version}]) version (if (s/includes? version ".") version (str version ".0")) @@ -223,7 +224,7 @@ ; The regex for the GNU family is a nightmare, so we build it up (and test it) in pieces (def agpl-re #"(?AGPL|Affero)(\s+GNU)?(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?AGPL\)?)?") -(def lgpl-re #"(?(GNU\s+(Genere?al\s+)?(Library\s+or\s+Lesser|Library|Lesser))|((Library\s+or\s+Lesser|Library|Lesser)\s+(GNU|GPL|Genere?al)|(L(esser\s)?\s*GPL)))(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?L\s*GPL\)?)?") +(def lgpl-re #"(?(GNU\s+(Genere?al\s+)?(Library\s+or\s+Lesser|Lesser\s+or\s+Library|Library|Lesser))|((Library\s+or\s+Lesser|Lesser\s+or\s+Library|Library|Lesser)\s+(GNU|GPL|Genere?al)|(L(esser\s)?\s*GPL)))(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?L\s*GPL\)?)?") (def gpl-re #"(?GNU(?!\s+Classpath)|(?\d+([\._]\d+)?)?") (def only-or-later-re #"[\s,-]*((?\(?only\)?)|(\(?or(\s+\(?at\s+your\s+(option|discretion)\)?)?(\s+any)?)?([\s-]*(?lat[eo]r|newer|greater|\+)))?") @@ -370,7 +371,7 @@ :fn (constantly ["Zlib" :high])} ]))) -(defn- match +(defn- parse-id "If a match occured for the given regex element when tested against string s, returns a map containing the following keys: * :id The SPDX license or exception identifier that was determined @@ -394,7 +395,7 @@ :start (:start match)} (when (seq confidence-explanations) {:confidence-explanations confidence-explanations}))))) -(defn matches +(defn parse-ids "Returns a sequence (NOT A SET!) of maps where each key is a SPDX license or exception identifier (a String) that was found in s, and the value is a sequence containing a single map describing how the identifier was determined. @@ -410,7 +411,7 @@ Results are in the order in which they appear in the string, and the function returns nil if there were no matches." [s] - (when-let [matches (seq (filter identity (e/pmap* (partial match s) @license-name-matching-d)))] + (when-let [matches (seq (filter identity (e/pmap* (partial parse-id s) @license-name-matching-d)))] (some->> matches (med/distinct-by :id) ;####TODO: THINK ABOUT MERGING INSTEAD OF DROPPING (sort-by :start) diff --git a/src/lice_comb/impl/matching.clj b/src/lice_comb/impl/parsing.clj similarity index 78% rename from src/lice_comb/impl/matching.clj rename to src/lice_comb/impl/parsing.clj index 755cd4f..9b0ede4 100644 --- a/src/lice_comb/impl/matching.clj +++ b/src/lice_comb/impl/parsing.clj @@ -16,9 +16,9 @@ ; SPDX-License-Identifier: Apache-2.0 ; -(ns lice-comb.impl.matching - "Matching helper functionality. Note: this namespace is not part of - the public API of lice-comb and may change without notice." +(ns lice-comb.impl.parsing + "License name, URI, and text parsing functionality. Note: this namespace is + not part of the public API of lice-comb and may change without notice." (:require [clojure.string :as s] [clojure.set :as set] [clojure.java.io :as io] @@ -28,9 +28,9 @@ [spdx.expressions :as sexp] [embroidery.api :as e] [lice-comb.impl.spdx :as lcis] - [lice-comb.impl.regex-matching :as lcirm] + [lice-comb.impl.id-detection :as lciid] + [lice-comb.impl.splitting :as lcisp] [lice-comb.impl.expressions-info :as lciei] - [lice-comb.impl.3rd-party :as lc3] [lice-comb.impl.http :as lcihttp] [lice-comb.impl.data :as lcid] [lice-comb.impl.utils :as lciu])) @@ -121,13 +121,13 @@ fix-mpl-2 fix-license-id-with-exception-id)) -(defmulti text->expressions-info +(defmulti match-text "Returns an expressions-info map for the given license text, or nil if no matches are found." {:arglists '([text])} class) -(defmethod text->expressions-info java.lang.String +(defmethod match-text java.lang.String [s] ; clj-spdx's *-within-text APIs are *expensive* but support batching, so we check batches of ids in parallel (let [num-cpus (.availableProcessors (Runtime/getRuntime)) @@ -143,36 +143,36 @@ ; Note: we don't need to sexp/normalise the keys here, as the only expressions that can be returned are constructed correctly (manual-fixes (into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-matching-guidelines})) expressions-found)))))) -(defmethod text->expressions-info java.io.Reader +(defmethod match-text java.io.Reader [r] (let [sw (java.io.StringWriter.)] (io/copy r sw) - (text->expressions-info (str sw)))) + (match-text (str sw)))) -(defmethod text->expressions-info java.io.InputStream +(defmethod match-text java.io.InputStream [is] - (text->expressions-info (io/reader is))) + (match-text (io/reader is))) -(defmethod text->expressions-info :default +(defmethod match-text :default [src] (when src (with-open [r (io/reader src)] - (doall (text->expressions-info r))))) + (doall (match-text r))))) -(defn uri->expressions-info - "Returns an expressions-info map for the given license uri, or nil if no - matches are found." +(defn parse-uri + "Parses the given license `uri`, returning an expressions-info map, or `nil` + if no matching license ids were found." [uri] (when-not (s/blank? uri) (let [result (manual-fixes - (let [suri (lciu/simplify-uri uri)] - (or ; 1. Does the simplified URI match any of the simplified URIs in the SPDX license or exception lists? - (when-let [ids (get @lcis/index-uri-to-id-d suri)] - (into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-uri :source (list uri)})) ids))) - - ; 2. attempt to retrieve the text/plain contents of the uri and perform license text matching on it - (when-let [license-text (lcihttp/get-text uri)] - (text->expressions-info license-text)))))] + (or + ; 1. Is the URI a close match for any of the URIs in the SPDX license or exception lists? + (when-let [ids (lcis/near-match-uri uri)] + (into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-uri :source (list uri)})) ids))) + + ; 2. attempt to retrieve the text/plain contents of the uri and perform license text matching on it + (when-let [license-text (lcihttp/get-text uri)] + (match-text license-text))))] ; We don't need to sexp/normalise the keys here, as we never detect an expression from a URI (lciei/prepend-source uri result)))) @@ -194,64 +194,27 @@ (map #(apply hash-map %) cursed-name)) ; 2. Is it an SPDX license or exception id? - (when-let [id (get @lcis/spdx-ids-d (s/lower-case s))] + (when-let [id (lcis/near-match-id s)] (if (= id s) (list {id (list {:id id :type :declared :strategy :spdx-listed-identifier-exact-match :source (list s)})}) (list {id (list {:id id :type :concluded :confidence :high :strategy :spdx-listed-identifier-case-insensitive-match :source (list s)})}))) ; 3. Is it the name of one or more SPDX licenses or exceptions? - (when-let [ids (get @lcis/index-name-to-id-d (s/lower-case s))] + (when-let [ids (lcis/near-match-name s)] (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-name :source (list s)})) ids)) ; 4. Might it be a URI? (this is to handle some dumb corner cases that exist in pom.xml files hosted on Clojars & Maven Central) - (when-let [ids (uri->expressions-info s)] + (when-let [ids (parse-uri s)] (map #(hash-map (key %) (val %)) ids)) - ; 5. Attempt regex name matching - (lcirm/matches s) + ; 5. Attempt to parse ids from the name + (lciid/parse-ids s) - ; 6. No clue, so return a single info map, but with a made up "UNIDENTIFIED-" value instead of an SPDX license or exception identifier + ; 6. No clue, so return a single info map, but with a made up "UNIDENTIFIED-" value (NOT A LICENSEREF!) instead of an SPDX license or exception identifier (let [id (str "UNIDENTIFIED-" s)] (list {id (list {:id id :type :concluded :confidence :low :confidence-explanations [:unidentified] :strategy :unidentified :source (list s)})})))] (map (partial lciei/prepend-source s) ids)))) -(defn- filter-blanks - "Filter blank strings out of coll" - [coll] - (when (seq coll) - (seq (filter #(or (not (string? %)) (not (s/blank? %))) coll)))) - -(defn- map-split-and-interpose - "Maps over the given sequence, splitting strings using the given regex re and - interposing the given value inter, returning a (flattened) sequence." - [re inter coll] - (mapcat #(if-not (string? %) - [%] - (let [splits (s/split % re)] - (if (nil? inter) - splits - (interpose inter splits)))) - coll)) - -(defn split-on-operators - "Case insensitively splits a string based on license operators (and, - or, with), but only if they're not also part of a license name (e.g. - 'Common Development and Distribution License', 'GNU General Public - License version 2.0 or (at your option) any later version', etc.)." - [s] - (when-not (s/blank? s) - (->> (s/split (s/trim s) #"(?i)\band[/-\\]+or\b") - (map-split-and-interpose #"(?i)(\band\b|\&)(?!\s+(distribution|all\s+rights\s+reserved))" - :and) - (map-split-and-interpose #"(?i)\bor\b(?!\s*(-?(greater|(any\s+)?later|(any\s+)?lator|(any\s+)?newer|lesser|library|\(?at\s+your\s+(option|discretion)\)?|([\"']?(Revised|Modified)[\"']?))))" - :or) - (map-split-and-interpose #"(?i)\b(with\b|w/)(?!\s+the\s+acknowledgment\s+clause\s+removed)" - :with) - (map-split-and-interpose #"(?i)(?<=CDDL)/(?=GPL)" ; Special case for splitting particularly cursed combos such as CDDL/GPLv2+CE - nil) - filter-blanks - (map #(if (string? %) (s/trim %) %))))) - (defn- fix-unidentified "Fixes a singleton UNIDENTIFIED- expression info map by converting the id to either a lice-comb unidentified LicenseRef or AdditionRef, depending on prev. @@ -341,16 +304,13 @@ (recur (process-expression-element result f) (first r) (rest r)) (manual-fixes (into {} result))))) -(defn name->expressions-info - "Returns an expressions-info map for the given license name." +(defn parse-name + "Parses the given license `n`ame, returning an expressions-info map." [n] (when-not (s/blank? n) (let [n (s/trim n) partial-result (some->> n - split-on-operators ; Split on operators - (drop-while keyword?) ; Drop (nonsensical) leading operators - (lc3/rdrop-while keyword?) ; Drop (nonsensical) trailing operators - dedupe ; Deduplicate consecutive identical values (mostly applies to duplicate operators, which are redundant) + lcisp/split-on-operators ; Split on operators (map #(if (keyword? %) % (string->ids-info %))) ; Determine SPDX ids (or UNIDENTIFIED-xxx) with info for all non-operators flatten ; Flatten back to an unnested sequence (since string->ids-info returns sequences) fix-unidentifieds ; Convert each unidentified non-operator into either a LicenseRef or AdditionRef, depending on context @@ -374,7 +334,7 @@ Note: this method has a substantial performance cost." [] (lcis/init!) - (lcirm/init!) + (lciid/init!) (lcihttp/init!) @cursed-names-d nil) diff --git a/src/lice_comb/impl/spdx.clj b/src/lice_comb/impl/spdx.clj index 66a7378..c0c15cc 100644 --- a/src/lice_comb/impl/spdx.clj +++ b/src/lice_comb/impl/spdx.clj @@ -20,6 +20,7 @@ "SPDX-related functionality. Note: this namespace is not part of the public API of lice-comb and may change without notice." (:require [clojure.string :as s] + [embroidery.api :as e] [spdx.licenses :as sl] [spdx.exceptions :as se] [spdx.expressions :as sexp] @@ -52,15 +53,28 @@ (def ^:private unidentified-addition-ref-prefix (str lice-comb-addition-ref-prefix "-UNIDENTIFIED")) ; Lower case id map -(def spdx-ids-d (delay (merge (into {} (map #(vec [(s/lower-case %) %]) @license-ids-d)) - (into {} (map #(vec [(s/lower-case %) %]) @exception-ids-d))))) +(def ^:private spdx-ids-d (delay (merge (into {} (map #(vec [(s/lower-case %) %]) @license-ids-d)) + (into {} (map #(vec [(s/lower-case %) %]) @exception-ids-d))))) + +(defn near-match-id + "Returns the (case-corrected) id for the given license or exception id `id`, + or `nil` if one wasn't found." + [id] + (get @spdx-ids-d (s/lower-case id))) (defn- name-to-id-tuple [list-entry] [(s/lower-case (s/trim (:name list-entry))) (:id list-entry)]) -(def index-name-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @license-list-d))) - (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @exception-list-d)))))) +(def ^:private index-name-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @license-list-d))) + (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @exception-list-d)))))) + +;####TODO: REPLACE THIS WITH REGEX BASED NEAR-MATCHING (to account for whitespace variance and #"licen[cs]e", for example) +(defn near-match-name + "Returns the id(s) for the given license or exception name `n`, or `nil` if + no ids were found." + [n] + (get @index-name-to-id-d (s/lower-case n))) (defn- urls-to-id-tuples "Extracts all urls for a given list (license or exception) entry." @@ -69,8 +83,14 @@ simplified-uris (map lciu/simplify-uri (filter (complement s/blank?) (concat (:see-also list-entry) (get-in list-entry [:cross-refs :url]))))] (map #(vec [% id]) simplified-uris))) -(def index-uri-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @license-list-d))) - (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @exception-list-d)))))) +(def ^:private index-uri-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @license-list-d))) + (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @exception-list-d)))))) + +(defn near-match-uri + "Returns the id(s) for the given license or exception `uri`, or `nil` if no + ids were found." + [uri] + (get @index-uri-to-id-d (lciu/simplify-uri uri))) (defn lice-comb-license-ref? "Is the given id one of lice-comb's custom LicenseRefs?" @@ -209,8 +229,8 @@ Note: this method has a substantial performance cost." [] ; Parallelise initialisation of the spdx.licenses and spdx.exceptions namespaces, as they're both sloooooooow (~1.5 mins total) - (let [sl-init (future (sl/init!)) - se-init (future (se/init!))] + (let [sl-init (e/future* (sl/init!)) + se-init (e/future* (se/init!))] @sl-init @se-init) (sexp/init!) diff --git a/src/lice_comb/impl/splitting.clj b/src/lice_comb/impl/splitting.clj new file mode 100644 index 0000000..6d5d54d --- /dev/null +++ b/src/lice_comb/impl/splitting.clj @@ -0,0 +1,213 @@ +; +; Copyright © 2024 Peter Monks +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. +; +; SPDX-License-Identifier: Apache-2.0 +; + +(ns lice-comb.impl.splitting + "Splitting helper functionality. Note: this namespace is not part of + the public API of lice-comb and may change without notice." + (:require [clojure.string :as s] + [lice-comb.impl.utils :as lciu])) + +(def op-re #"\s+(and[/\-\\]or|and|&|or|w/|with|\s+)+\s+") +(def leading-op-re (lciu/re-concat "(?i)\\A" op-re)) +(def trailing-op-re (lciu/re-concat "(?i)" op-re "\\z")) + +(defn strip-leading-and-trailing-operators + "Strips all leading and trailing operators (and, or, and/or, with) from `s`." + [s] + (when s + (let [new-s (-> (str " " s " ") ; Ensure s has leading and trailing whitespace, as op-re requires it to exist + (s/replace leading-op-re "") + (s/replace trailing-op-re ""))] + (when-not (s/blank? new-s) + (s/trim new-s))))) + +(def ^:private and-or-splitting-re #"(?i)(?<=\s)and[/\-\\]or(?=\s)") +(def ^:private abbreviated-with-splitting-re #"(?i)\bw/") +(def ^:private abbreviated-and-splitting-re #"(?i)&") +(def ^:private and-or-with-splitting-re #"(?i)\b((and|&)(?![/\-\\]or)|(?> (lciu/retained-split (strip-leading-and-trailing-operators s) and-or-splitting-re) + (mapcat #(lciu/retained-split % abbreviated-with-splitting-re)) + (mapcat #(lciu/retained-split % abbreviated-and-splitting-re)) + (mapcat #(lciu/retained-split % and-or-with-splitting-re))))) + +(defn- tag-operators + "Tags potential operators in `coll` (a sequence of `String`s), by turning + them into tuples of `[keyword original-value]`. The keywords are: + * :and + * :or + * :with + * :and-or" + [coll] + (map #(case (s/lower-case %) + ("and" "&") [:and %] + "or" [:or %] + ("with" "w/") [:with %] + ("and/or" "and-or" "and\\or") [:and-or %] + %) + coll)) + +(defn- any-re-pairs-match? + "Do any of the regular expression pairs in `re-pairs` match `before-text` (1st + regular expression in the pair) and also `after-text` (2nd regular expression + in the pair)?" + [before-text after-text re-pairs] + (boolean (some identity (map #(and (re-find (first %) before-text) + (re-find (second %) after-text)) + re-pairs)))) + +;####TODO: build these programmatically from the SPDX license and exception lists +; Relevant names can be found with this code: +; (let [all-names (concat (map :name @lcis/license-list-d) (map :name @lcis/exception-list-d)) +; relevant-names (filter #(> (count %) 1) (map naive-operator-split all-names))] + +(def ^:private re-and-before-after-pairs [ + ; :and operator checks + [#"(?i)\bcopyright\s*\z" #"(?i)\A\s*all\s+rights\s+reserved\b"] + [#"(?i)\bcommon\s+development\s*\z" #"(?i)\A\s*distribution\s+licen[cs]e\b"] + [#"(?i)\bBSD\s+with\s+attribution\s*\z" #"(?i)\A\s*HPND\s+disclaimer\b"] + [#"(?i)\bHistorical\s+Permission\s+Notice\s*\z" #"(?i)\A\s*Disclaimer\b"] + [#"(?i)\bHPND\s+with\s+US\s+Government\s+export\s+control\s+warning\s*\z" #"(?i)\A\s*(acknowledgment|modification)\b"] + [#"(?i)\bHistorical\s+Permission\s+Notice\s*\z" #"(?i)\A\s*Disclaimer\b"] + [#"(?i)\bIBM\s+PowerPC\s+Initialization\s*\z" #"(?i)\A\s*Boot\s+Software\b"] + [#"(?i)\bLZMA\s+SDK\s+Licen[cs]e\s+\(?versions?\s+ß\d\.\d\d\s*\z" #"(?i)\A\s*beyond\)?\b"] + [#"(?i)\bNara\s+Institute\s+of\s+Science\s*\z" #"(?i)\A\s*Technology\s+Licen[cs]e\b"] + [#"(?i)\bOpen\s+LDAP\s+Public\s+Licen[cs]e\s+v2\.0\s+\(?or\s+possibly\s+2\.0A\s*\z" #"(?i)\A\s*2.0B\)?\b"] + [#"(?i)\bUnicode\s+Licen[cs]e\s+Agreement\s+[\-\s]*Data\s+Files\s*\z" #"(?i)\A\s*Software\b"] + [#"(?i)\bW3C\s+Software\s+Notice\s*\z" #"(?i)\A\s*(Document\s+)?Licen[cs]e\b"] + [#"(?i)\bbzip2\s*\z" #"(?i)\A\s*libbzip2\s+Licen[cs]e\b"]]) + +(defn- non-operator-and-within-text? + "Was there a a non-operator use of `and` between the two texts?" + [before-text after-text] + (any-re-pairs-match? before-text after-text re-and-before-after-pairs)) + +(def ^:private re-or-before-after-pairs [ + ; Original regexes + [#"(?i).+\z" #"(?i)\A[\s\-]*(greater|(any\s+)?lat[eo]r|(any\s+)?newer|\(?at\s+your\s+(option|discretion)\)?|([\"']?(Revised|Modified)[\"']?))"] + [#"(?i)\blesser\s*\z" #"(?i)\A[\s\-]*library\b"] + [#"(?i)\blibrary\s*\z" #"(?i)\A[\s\-]*lesser\b"] + ; Names from SPDX license list + [#"(?i)\bBSD\s+3-Clause\s+\"?New\"?\s*\z" #"(?i)\A\s*\"?Revised\"?\s+Licen[cs]e\b"] + [#"(?i)\bBSD\s+4-Clause\s+\"?Original\"?\s*\z" #"(?i)\A\s*\"?Old\"?\s+Licen[cs]e\b"] + [#"(?i)\bOpen\s+LDAP\s+Public\s+Licen[cs]e\s+v2\.0\s+\(\s*\z" #"(?i)\A\s*possibly\s+2\.0A\b"]]) + +(defn- non-operator-or-within-text? + "Was there a a non-operator use of `or` between the two texts?" + [before-text after-text] + (any-re-pairs-match? before-text after-text re-or-before-after-pairs)) + +(def ^:private re-with-before-after-pairs [ + ; Original regexes + [#"(?i).+\z" #"(?i)\A\s*the\s+acknowledgment\s+clause\s+removed"] + ; Names from SPDX license list + [#"(?i)\bBSD\s*\z" #"(?i)\A\s*Attribution\b"] + [#"(?i)\bFSF\s+Unlimited\s+Licen[cs]e(\s+\()?\s*\z" #"(?i)\A\s*Licen[cs]e\s+Retention\b"] + [#"(?i)\bGood\s+Luck\s*\z" #"(?i)\A\s*That\s+Public\s+Licen[cs]e\b"] + [#"(?i)\bHPND\s*\z" #"(?i)\A\s*US\s+Government\s+export\s+control\b"] + [#"(?i)\bHPND\s+sell\s+variant\s*\z" #"(?i)\A\s*MIT\s+disclaimer\b"] + [#"(?i)\bHistorical\s+Permission\s+Notice\s+and\s+Disclaimer(\s+-\s+sell\s+xserver\s+variant)?\s*\z" #"(?i)\A\s*MIT\s+disclaimer\b"] + [#"(?i)\bANTLR\s+Software\s+Rights\s+Notice\s*\z" #"(?i)\A\s*licen[cs]e\s+fallback\b"] + [#"(?i)\bLatex2e\s*\z" #"(?i)\A\s*translated\s+notice\s+permission\b"] + [#"(?i)\bNIST\s+Public\s+Domain\s+Notice\s*\z" #"(?i)\A\s*licen[cs]e\s+fallback\b"] + [#"(?i)\bSIL\s+Open\s+Font\s+Licen[cs]e\s+\d+\.\d+\s*\z" #"(?i)\A\s*(no\s+)?Reserved\s+Font\s+Name\b"] + [#"(?i)\bzlib/libpng\s+Licen[cs]e\s*\z" #"(?i)\A\s*Acknowledgement\b"]]) + +(defn- non-operator-with-within-text? + "Was there a a non-operator use of `with` between the two texts?" + [before-text after-text] + (any-re-pairs-match? before-text after-text re-with-before-after-pairs)) + +(defn- invalid-operator? + "Returns `true` if an invalid operator was identified. Examples include: + * `GNU Lesser or Library Public License` + * `Common Development and Distribution License`" + [before-text op after-text] + (case op + :and (non-operator-and-within-text? before-text after-text) + :or (non-operator-or-within-text? before-text after-text) + :with (non-operator-with-within-text? before-text after-text) + false)) + +(defn- validate-operators + "Validates operators (identified in `coll` as a tuple - see + `identify-operators` for details), by either: + * replacing it with its associated keyword, if it's a valid operator + * recombining it with its neighboring values, if it's not a valid operator + (e.g. `[\"GNU Lesser \" [:or \"or\"] \" Library Public License\"]`) + * removing it if it's an :and-or" + [coll] + (loop [result [] + before-text (first coll) + op (second coll) + after-text (nth coll 2 nil) + r (nthrest coll 3)] + (if-not before-text + ; Base case - clean result and return + (map #(if (string? %) (s/trim %) %) result) + ; Recursive case + (if (string? after-text) + ; Normal case: string op string + (let [[op-kw op-str] op] + (if (invalid-operator? before-text op-kw after-text) + (let [new-before-text (str before-text op-str after-text) ; not a valid operator, so recombine the before and after text + new-result (conj (vec (take (dec (count result)) result)) new-before-text)] + (recur new-result new-before-text (first r) (second r) (nthrest r 2))) + (let [new-result (if (= op-kw :and-or) + (concat result [before-text after-text]) ; and/or "operator", so drop the operator but retain the split + (concat result [before-text op-kw after-text]))] ; valid operator, so insert the operator's keyword + (recur new-result after-text (first r) (second r) (nthrest r 2))))) + + ; Unusual case: string op op + (recur (conj (vec result) before-text) (nth r 2 nil) (nth r 3 nil) (nth r 4 nil) (nthrest r 4)))))) + +(def ^:private cursed-re #"(?i)(?<=CDDL)/(?=GPL)") ; Special case for splitting particularly cursed combos such as CDDL/GPLv2+CE) + +(defn- split-cursed + "Split any strings in `coll` that contain particularly cursed values." + [coll] + (mapcat #(if (string? %) + (s/split % cursed-re) + [%]) + coll)) + +(defn- filter-blank-strings + "Removes blank strings from `coll`, but keeps everything else, including + elements that are not strings." + [coll] + (filter #(or (not (string? %)) (not (s/blank? %))) coll)) + +(defn split-on-operators + "Case insensitively splits a string based on license operators (and, + or, with), but only if they're not also part of a license name (e.g. + 'Common Development and Distribution License', 'GNU General Public + License version 2.0 or (at your option) any later version', etc.)." + [s] + (when-not (s/blank? s) + (->> s + naive-operator-split + tag-operators + validate-operators + split-cursed + filter-blank-strings + dedupe))) diff --git a/src/lice_comb/impl/utils.clj b/src/lice_comb/impl/utils.clj index 0183048..e21ee72 100644 --- a/src/lice_comb/impl/utils.clj +++ b/src/lice_comb/impl/utils.clj @@ -102,18 +102,40 @@ [& res] (re-pattern (s/join res))) +(defn retained-split + "As for `clojure.string/split`, but retains whatever `re` matched as distinct + elements in the result." + [s re] + (let [m (re-matcher re s) + split-indices (loop [result [] + f (.find m)] + (if f + (recur (concat result [(.start m) (.end m)]) (.find m)) + (when-not (empty? result) + (dedupe (concat [0] result [(count s)])))))] + (if (empty? split-indices) + [s] + (mapv #(subs s (first %) (second %)) (partition 2 1 split-indices))))) + +(def ^java.nio.charset.Charset utf8-charset java.nio.charset.StandardCharsets/UTF_8) + +(defn utf8-bytes + "The UTF-8 encoded bytes of `s` (a `String`), as a Java `byte[]`." + [^String s] + (.getBytes s utf8-charset)) + (defn base62-encode "Encodes the given string to Base62/UTF-8." [^String s] (when s - (base62/encode (.getBytes s java.nio.charset.StandardCharsets/UTF_8)))) + (base62/encode (utf8-bytes s)))) (defn base62-decode "Decodes the given Base62/UTF-8 string." [^String s] (when s (if (re-matches #"\p{Alnum}*" s) - (java.lang.String. ^bytes (base62/decode s) java.nio.charset.StandardCharsets/UTF_8) + (java.lang.String. ^bytes (base62/decode s) utf8-charset) (throw (ex-info (str "Invalid BASE62 value provided: " s) {}))))) ; Because clj-base62 has crappy error messages (defn html->text diff --git a/src/lice_comb/matching.clj b/src/lice_comb/matching.clj index 8c02446..c9af0a5 100644 --- a/src/lice_comb/matching.clj +++ b/src/lice_comb/matching.clj @@ -60,13 +60,13 @@ starting from the most general (the input) through to the most specific (the smallest subset of the input that was used to make this determination)." - (:require [clojure.string :as s] - [spdx.licenses :as sl] - [spdx.exceptions :as se] - [spdx.expressions :as sexp] - [lice-comb.impl.spdx :as lcis] - [lice-comb.impl.matching :as lcim] - [lice-comb.impl.utils :as lciu])) + (:require [clojure.string :as s] + [spdx.licenses :as sl] + [spdx.exceptions :as se] + [spdx.expressions :as sexp] + [lice-comb.impl.spdx :as lcis] + [lice-comb.impl.parsing :as lcip] + [lice-comb.impl.utils :as lciu])) (defn lice-comb-license-ref? "Is the given id one of lice-comb's custom LicenseRefs?" @@ -124,7 +124,7 @@ * you cannot pass a `String` representation of a filename to this method - you should pass filenames through `clojure.java.io/file` (or similar) first" [text] - (lcim/text->expressions-info text)) + (lcip/match-text text)) (defn text->expressions "Returns a set of SPDX expressions (`String`s) for `text`. See @@ -151,7 +151,7 @@ 2. URIs in the SPDX license and exception lists are not unique - the same URI may represent multiple licenses and/or exceptions." [uri] - (lcim/uri->expressions-info uri)) + (lcip/parse-uri uri)) (defn uri->expressions "Returns a set of SPDX expressions (`String`s) for `uri`. See @@ -162,28 +162,31 @@ set)) (defn name->expressions-info - "Returns an expressions-info map for `name` (a `String`), or `nil` if no + "Returns an expressions-info map for `n` (a `String`), or `nil` if no expressions were found. This involves: - 1. Determining whether `name` is a valid SPDX license expression, and if so + 1. Determining whether `n` is a valid SPDX license expression, and if so normalising it (see [clj-spdx's `spdx.expressions/normalise` fn](https://pmonks.github.io/clj-spdx/spdx.expressions.html#var-normalise)) - 2. Checking if `name` is actually a URI, and if so performing URL matching + 2. Checking if `n` is actually a URI, and if so performing URL matching on it via [[uri->expressions-info]] - 3. attempting to parse `name` to construct one or more SPDX license + 3. attempting to parse `n` to construct one or more SPDX license expressions" - [name] - (when-not (s/blank? name) - (let [name (s/trim name)] + [n] + (when-not (s/blank? n) + (let [n (s/trim n)] ; 1. If it's a valid SPDX expression, return the normalised rendition of it - (if-let [normalised-expression (sexp/normalise name)] - {normalised-expression (list {:type :declared :strategy :spdx-expression :source (list name)})} - ; 2. If it's a URI, use URI matching (this is to handle messed up real world cases where license names in POMs contain a URI) - (if (lciu/valid-http-uri? name) - (if-let [ids (uri->expressions-info name)] - ids - {(lcis/name->unidentified-license-ref name) (list {:type :concluded :confidence :low :strategy :unidentified :source (list name)})}) ; It was a URL, but we weren't able to resolve it to any ids, so return it as unidentified - ; 3. Attempt to build SPDX expression(s) from the name - (lcim/name->expressions-info name)))))) + (if-let [normalised-expression (sexp/normalise n)] + {normalised-expression (list {:type :declared :strategy :spdx-expression :source (list n)})} + ; 2. Is it a listed license or exception name? + (if-let [ids (lcis/near-match-name n)] + (into {} (map #(vec [% (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-name :source (list n)})]) ids)) + ; 3. If it's a URI, use URI matching (this is to handle messed up real world cases where license names in POMs contain a URI) + (if (lciu/valid-http-uri? n) + (if-let [ids (uri->expressions-info n)] + ids + {(lcis/name->unidentified-license-ref n) (list {:type :concluded :confidence :low :strategy :unidentified :source (list n)})}) ; It was a URL, but we weren't able to resolve it to any ids, so return it as unidentified + ; 4. Attempt to parse the name + (lcip/parse-name n))))))) (defn name->expressions "Returns a set of SPDX expressions (`String`s) for `name`. See @@ -202,5 +205,5 @@ Note: this method may have a substantial performance cost." [] (lcis/init!) - (lcim/init!) + (lcip/init!) nil) diff --git a/src/lice_comb/maven.clj b/src/lice_comb/maven.clj index 18f5935..46ece31 100644 --- a/src/lice_comb/maven.clj +++ b/src/lice_comb/maven.clj @@ -40,7 +40,7 @@ [clojure.tools.logging :as log] [xml-in.core :as xi] [lice-comb.matching :as lcm] - [lice-comb.impl.matching :as lcim] + [lice-comb.impl.parsing :as lcip] [lice-comb.impl.expressions-info :as lciei] [lice-comb.impl.http :as lcihttp] [lice-comb.impl.utils :as lciu])) @@ -322,7 +322,7 @@ (map licenses-from-pair) (filter identity) (into (array-map)) ; We force the use of an array-map here to preserve order - lcim/manual-fixes + lcip/manual-fixes create-single-expression)] license-ei) ; License block doesn't exist, so attempt to lookup the parent pom and try again diff --git a/test/lice_comb/impl/regex_matching_test.clj b/test/lice_comb/impl/id_parsing_test.clj similarity index 93% rename from test/lice_comb/impl/regex_matching_test.clj rename to test/lice_comb/impl/id_parsing_test.clj index 739079b..ba13c0a 100644 --- a/test/lice_comb/impl/regex_matching_test.clj +++ b/test/lice_comb/impl/id_parsing_test.clj @@ -16,13 +16,13 @@ ; SPDX-License-Identifier: Apache-2.0 ; -(ns lice-comb.impl.regex-matching-test - (:require [clojure.test :refer [deftest testing is use-fixtures]] - [clojure.set :as set] - [rencg.api :as rencg] - [lice-comb.impl.utils :as lcu] - [lice-comb.test-boilerplate :refer [fixture testing-with-data]] - [lice-comb.impl.regex-matching :refer [init! version-re only-or-later-re agpl-re lgpl-re gpl-re gnu-re matches]])) +(ns lice-comb.impl.id-parsing-test + (:require [clojure.test :refer [deftest testing is use-fixtures]] + [clojure.set :as set] + [rencg.api :as rencg] + [lice-comb.impl.utils :as lcu] + [lice-comb.test-boilerplate :refer [fixture testing-with-data]] + [lice-comb.impl.id-parsing :refer [init! version-re only-or-later-re agpl-re lgpl-re gpl-re gnu-re parse-ids]])) (use-fixtures :once fixture) @@ -69,6 +69,7 @@ "GNU LGPL version 3" '("LGPL-3.0-only") "GNU LGPL-3.0" '("LGPL-3.0-only") "GNU LGPLv3 " '("LGPL-3.0-only") + "GNU Lesser" '("LGPL-2.0-or-later") "GNU Lesser GPL" '("LGPL-2.0-or-later") "GNU Lesser General Public Licence" '("LGPL-2.0-or-later") "GNU Lesser General Public Licence 3.0" '("LGPL-3.0-only") @@ -92,12 +93,15 @@ "GNU Lesser General Public License, version 3 or greater" '("LGPL-3.0-or-later") "GNU Lesser General Public License, version 3.0 or (at your option) any later version" '("LGPL-3.0-or-later") "GNU Lesser General Pulic License v2.1" '("LGPL-2.1-only") - "GNU Lesser Genereal Public License" '("LGPL-2.0-or-later") + "GNU Lesser Genereal Public License" '("LGPL-2.0-or-later") ; Note messed up spelling of "general" "GNU Lesser Public License" '("LGPL-2.0-or-later") + "GNU Library" '("LGPL-2.0-or-later") "GNU Library General Public License" '("LGPL-2.0-or-later") + "GNU Lesser or Library General Public License (LGPL)" '("LGPL-2.0-or-later") "GNU Library or Lesser General Public License (LGPL)" '("LGPL-2.0-or-later") "GNU Library or Lesser General Public License (LGPL) 2.1" '("LGPL-2.1-only") "GNU Library or Lesser General Public License (LGPL) V2.1" '("LGPL-2.1-only") + "GNU Lesser or Library General Public License (LGPL) V2.1" '("LGPL-2.1-only") "Gnu Lesser Public License" '("LGPL-2.0-or-later") "L GPL 3" '("LGPL-3.0-only") "LGPL" '("LGPL-2.0-or-later") @@ -112,6 +116,9 @@ "LGPLv3" '("LGPL-3.0-only") "LGPLv3+" '("LGPL-3.0-or-later") "Lesser GPL" '("LGPL-2.0-or-later") + "Lesser GNU" '("LGPL-2.0-or-later") + "Library GPL" '("LGPL-2.0-or-later") + "Library GNU" '("LGPL-2.0-or-later") "Lesser General Public License" '("LGPL-2.0-or-later") "Lesser General Public License (LGPL)" '("LGPL-2.0-or-later") "Licensed under GNU Lesser General Public License Version 3 or later (the " '("LGPL-3.0-or-later") @@ -241,5 +248,5 @@ (is (every? not-nil? (map (partial test-regex gnu-re) gnu-licenses))))) (deftest match-regexes-tests - (testing-with-data "GNU Family Regexes - correct identifier results" #(mapcat keys (matches %)) gnu-licenses-and-ids) - (testing-with-data "CC Family Regexes - correct identifier results" #(mapcat keys (matches %)) cc-by-licenses-and-ids)) + (testing-with-data "GNU Family Regexes - correct identifier results" #(mapcat keys (parse-ids %)) gnu-licenses-and-ids) + (testing-with-data "CC Family Regexes - correct identifier results" #(mapcat keys (parse-ids %)) cc-by-licenses-and-ids)) diff --git a/test/lice_comb/impl/matching_test.clj b/test/lice_comb/impl/matching_test.clj deleted file mode 100644 index 54b907c..0000000 --- a/test/lice_comb/impl/matching_test.clj +++ /dev/null @@ -1,71 +0,0 @@ -; -; Copyright © 2023 Peter Monks -; -; Licensed under the Apache License, Version 2.0 (the "License"); -; you may not use this file except in compliance with the License. -; You may obtain a copy of the License at -; -; http://www.apache.org/licenses/LICENSE-2.0 -; -; Unless required by applicable law or agreed to in writing, software -; distributed under the License is distributed on an "AS IS" BASIS, -; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -; See the License for the specific language governing permissions and -; limitations under the License. -; -; SPDX-License-Identifier: Apache-2.0 -; - -(ns lice-comb.impl.matching-test - (:require [clojure.test :refer [deftest testing is use-fixtures]] - [lice-comb.test-boilerplate :refer [fixture]] - [lice-comb.impl.matching :refer [split-on-operators]])) - -(use-fixtures :once fixture) - -(deftest split-on-operators-tests - (testing "nil/empty/blank" - (is (nil? (split-on-operators nil))) - (is (nil? (split-on-operators ""))) - (is (nil? (split-on-operators " ")))) - (testing "Simple non-splits" - (is (= '("foo") (split-on-operators "foo"))) - (is (= '("Apache") (split-on-operators "Apache"))) - (is (= '("Apache MIT BSD") (split-on-operators "Apache MIT BSD"))) - (is (= '("ApacheandMIT") (split-on-operators "ApacheandMIT"))) - (is (= '("Apacheand MIT") (split-on-operators "Apacheand MIT"))) - (is (= '("Apache andMIT") (split-on-operators "Apache andMIT"))) - (is (= '("ApacheorMIT") (split-on-operators "ApacheorMIT"))) - (is (= '("Apacheor MIT") (split-on-operators "Apacheor MIT"))) - (is (= '("Apache orMIT") (split-on-operators "Apache orMIT"))) - (is (= '("ApachewithMIT") (split-on-operators "ApachewithMIT"))) - (is (= '("Apachewith MIT") (split-on-operators "Apachewith MIT"))) - (is (= '("Apache withMIT") (split-on-operators "Apache withMIT"))) - (is (= '("Apachew/MIT") (split-on-operators "Apachew/MIT"))) - (is (= '("Apachew/ MIT") (split-on-operators "Apachew/ MIT")))) - (testing "Simple and splits" - (is (= '("Apache" :and "MIT") (split-on-operators "Apache and MIT"))) - (is (= '("Apache" :and "MIT") (split-on-operators "Apache AND MIT"))) - (is (= '("Apache" :and "MIT") (split-on-operators "Apache aNd MIT"))) - (is (= '("Apache" :and "MIT") (split-on-operators "Apache & MIT"))) - (is (= '("Apache" :and "MIT") (split-on-operators "Apache &MIT"))) - (is (= '("Apache" :and "MIT") (split-on-operators "Apache&MIT")))) - (testing "Simple or splits" - (is (= '("Apache" :or "MIT") (split-on-operators "Apache or MIT"))) - (is (= '("Apache" :or "MIT") (split-on-operators "Apache OR MIT"))) - (is (= '("Apache" :or "MIT") (split-on-operators "Apache oR MIT")))) - (testing "Simple with splits" - (is (= '("Apache" :with "MIT") (split-on-operators "Apache with MIT"))) - (is (= '("Apache" :with "MIT") (split-on-operators "Apache WITH MIT"))) - (is (= '("Apache" :with "MIT") (split-on-operators "Apache wItH MIT"))) - (is (= '("Apache" :with "MIT") (split-on-operators "Apache w/ MIT"))) - (is (= '("Apache" :with "MIT") (split-on-operators "Apache w/MIT")))) - (testing "Complex non-splits" - (is (= '("COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0") (split-on-operators "COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0"))) - (is (= '("Copyright & all rights reserved Lean Pixel") (split-on-operators "Copyright & all rights reserved Lean Pixel"))) - (is (= '("GNU General Public License v3.0 or later") (split-on-operators "GNU General Public License v3.0 or later"))) - (is (= '("GNU General Public License, Version 3 (or later)") (split-on-operators "GNU General Public License, Version 3 (or later)"))) - (is (= '("GNU Lesser General Public License, version 2.1 or newer") (split-on-operators "GNU Lesser General Public License, version 2.1 or newer"))) - (is (= '("GNU General Public License, v2.0 or greater") (split-on-operators "GNU General Public License, v2.0 or greater"))) - (is (= '("GNU General Public License, version 3.0 or any later version") (split-on-operators "GNU General Public License, version 3.0 or any later version"))) - (is (= '("LGPL-3.0-or-later") (split-on-operators "LGPL-3.0-or-later"))))) diff --git a/test/lice_comb/impl/splitting_test.clj b/test/lice_comb/impl/splitting_test.clj new file mode 100644 index 0000000..622f640 --- /dev/null +++ b/test/lice_comb/impl/splitting_test.clj @@ -0,0 +1,201 @@ +; +; Copyright © 2024 Peter Monks +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. +; +; SPDX-License-Identifier: Apache-2.0 +; + +(ns lice-comb.impl.splitting-test + (:require [clojure.string :as s] + [clojure.test :refer [deftest testing is use-fixtures]] + [spdx.licenses :as slic] + [spdx.exceptions :as sexc] + [lice-comb.test-boilerplate :refer [fixture]] + [lice-comb.impl.splitting :refer [strip-leading-and-trailing-operators split-on-operators]])) + +(use-fixtures :once fixture) + +(deftest strip-leading-and-trailing-operators-tests + (testing "Nil, blank, etc." + (is (nil? (strip-leading-and-trailing-operators nil))) + (is (nil? (strip-leading-and-trailing-operators ""))) + (is (nil? (strip-leading-and-trailing-operators " "))) + (is (nil? (strip-leading-and-trailing-operators " \n \r \t\t\t ")))) + (testing "Strings without any operators" + (is (= "Foo" (strip-leading-and-trailing-operators "Foo"))) + (is (= "hand" (strip-leading-and-trailing-operators "hand"))) + (is (= "android" (strip-leading-and-trailing-operators "android"))) + (is (= "ornery" (strip-leading-and-trailing-operators "ornery"))) + (is (= "stator" (strip-leading-and-trailing-operators "stator"))) + (is (= "withhold" (strip-leading-and-trailing-operators "withhold"))) + (is (= "The quick brown fox jumped over the lazy dogs." (strip-leading-and-trailing-operators "The quick brown fox jumped over the lazy dogs.")))) + (testing "Strings without leading or trailing operators" + (is (= "Foo and bar" (strip-leading-and-trailing-operators "Foo and bar"))) + (is (= "android or stator" (strip-leading-and-trailing-operators "android or stator"))) + (is (= "ornery or hand" (strip-leading-and-trailing-operators "ornery or hand"))) + (is (= "ornery and/or hand" (strip-leading-and-trailing-operators "ornery and/or hand"))) + (is (= "withhold and-or hand" (strip-leading-and-trailing-operators "withhold and-or hand"))) + (is (= "withhold and or with and/or and-or and\\or hand" (strip-leading-and-trailing-operators "withhold and or with and/or and-or and\\or hand")))) + (testing "Operators only" + (is (nil? (strip-leading-and-trailing-operators "and"))) + (is (nil? (strip-leading-and-trailing-operators "AND "))) + (is (nil? (strip-leading-and-trailing-operators " aNd"))) + (is (nil? (strip-leading-and-trailing-operators "or"))) + (is (nil? (strip-leading-and-trailing-operators " OR"))) + (is (nil? (strip-leading-and-trailing-operators "oR "))) + (is (nil? (strip-leading-and-trailing-operators " with"))) + (is (nil? (strip-leading-and-trailing-operators "WITH "))) + (is (nil? (strip-leading-and-trailing-operators "wItH"))) + (is (nil? (strip-leading-and-trailing-operators "and/or"))) + (is (nil? (strip-leading-and-trailing-operators " AND-OR "))) + (is (nil? (strip-leading-and-trailing-operators "and \n and"))) + (is (nil? (strip-leading-and-trailing-operators " or or "))) + (is (nil? (strip-leading-and-trailing-operators " and or with and/or with\tand or with and-or with or and ")))) + (testing "Leading operators" + (is (= "Foo" (strip-leading-and-trailing-operators "and Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "AND Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "aNd Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "or Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "OR Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "oR Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "with Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "WITH Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "wItH Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "w/ Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "and/or Foo "))) + (is (= "Foo" (strip-leading-and-trailing-operators " AND-OR \n\t Foo"))) + (is (= "Foo" (strip-leading-and-trailing-operators "and/or with and or and-or Foo")))) + (testing "Trailing operators" + (is (= "Foo" (strip-leading-and-trailing-operators "Foo and"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo AND"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo aNd"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo or"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo OR"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo oR"))) + (is (= "Foo" (strip-leading-and-trailing-operators " Foo with "))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo WITH"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo w/"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo \t\r wItH"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo and/or"))) + (is (= "Foo" (strip-leading-and-trailing-operators "Foo AND-OR "))) + (is (= "Foo" (strip-leading-and-trailing-operators " Foo and or and/or\nwith and-or\t"))))) + +(deftest split-on-operators-tests + (testing "Nil, blank, etc." + (is (nil? (split-on-operators nil))) + (is (nil? (split-on-operators ""))) + (is (nil? (split-on-operators " "))) + (is (nil? (split-on-operators " \n \r \t\t\t ")))) + (testing "Strings without any operators" + (is (= ["Foo"] (split-on-operators "Foo"))) + (is (= ["Apache"] (split-on-operators "Apache"))) + (is (= ["Apache MIT BSD"] (split-on-operators "Apache MIT BSD"))) + (is (= ["ApacheandMIT"] (split-on-operators "ApacheandMIT"))) + (is (= ["Apacheand MIT"] (split-on-operators "Apacheand MIT"))) + (is (= ["Apache andMIT"] (split-on-operators "Apache andMIT"))) + (is (= ["ApacheorMIT"] (split-on-operators "ApacheorMIT"))) + (is (= ["Apacheor MIT"] (split-on-operators "Apacheor MIT"))) + (is (= ["Apache orMIT"] (split-on-operators "Apache orMIT"))) + (is (= ["ApachewithMIT"] (split-on-operators "ApachewithMIT"))) + (is (= ["Apachewith MIT"] (split-on-operators "Apachewith MIT"))) + (is (= ["Apache withMIT"] (split-on-operators "Apache withMIT"))) + (is (= ["Apachew/MIT"] (split-on-operators "Apachew/MIT"))) + (is (= ["Apachew/ MIT"] (split-on-operators "Apachew/ MIT"))) + (is (= ["The quick brown fox jumped over the lazy dogs."] (split-on-operators "The quick brown fox jumped over the lazy dogs.")))) + (testing "Strings containing words that contain an operator, but which should not be split" + (is (= ["android"] (split-on-operators "android"))) + (is (= ["hand"] (split-on-operators "hand"))) + (is (= ["ornery"] (split-on-operators "ornery"))) + (is (= ["stator"] (split-on-operators "stator"))) + (is (= ["withhold"] (split-on-operators "withhold"))) + (is (= ["forthwith"] (split-on-operators "forthwith")))) + (testing "Simple and splits" + (is (= ["Apache" :and "MIT"] (split-on-operators "Apache and MIT"))) + (is (= ["Apache" :and "MIT"] (split-on-operators "Apache AND MIT"))) + (is (= ["Apache" :and "MIT"] (split-on-operators "Apache aNd MIT"))) + (is (= ["Apache" :and "MIT"] (split-on-operators " Apache & MIT"))) + (is (= ["Apache" :and "MIT"] (split-on-operators "Apache &MIT "))) + (is (= ["Apache" :and "MIT"] (split-on-operators " Apache&MIT ")))) + (testing "Simple or splits" + (is (= ["Apache" :or "MIT"] (split-on-operators "Apache or MIT"))) + (is (= ["Apache" :or "MIT"] (split-on-operators "Apache OR MIT"))) + (is (= ["Apache" :or "MIT"] (split-on-operators "Apache oR MIT"))) + (is (= ["MIT" :or "Lesser GPL"] (split-on-operators "MIT or Lesser GPL"))) + (is (= ["GNU Lesser" :or "MIT"] (split-on-operators "GNU Lesser OR MIT"))) + (is (= ["GNU Lesser" :or "Lesser GPL"] (split-on-operators "GNU Lesser OR Lesser GPL"))) ; This one is evil... + (is (= ["GNU Library" :or "Library GPL"] (split-on-operators "GNU Library OR Library GPL")))) ; Ditto + (testing "Simple with splits" + (is (= ["GPL" :with "CE"] (split-on-operators "GPL with CE "))) + (is (= ["Apache" :with "MIT"] (split-on-operators "Apache with MIT"))) + (is (= ["Apache" :with "MIT"] (split-on-operators "Apache WITH MIT"))) + (is (= ["Apache" :with "MIT"] (split-on-operators "Apache wItH MIT"))) + (is (= ["Apache" :with "MIT"] (split-on-operators "Apache w/ MIT"))) + (is (= ["Apache" :with "MIT"] (split-on-operators "Apache w/MIT")))) + (testing "Simple and/or splits" + (is (= ["MIT" "GPL"] (split-on-operators "MIT and/or GPL"))) + (is (= ["MIT" "GPL"] (split-on-operators " MIT and-or GPL "))) + (is (= ["MIT" "GPL"] (split-on-operators "MIT\nand\\or\nGPL")))) + (testing "Strings with multiple operators" + (is (= ["Apache" :and "MIT"] (split-on-operators "Apache and AND MIT"))) + (is (= ["Apache" :or "MIT"] (split-on-operators "Apache\nOR\noR\tMIT"))) + (is (= ["Apache" :and "MIT" :or "BSD"] (split-on-operators "\nApache\nAND\nMIT\nOR\nBSD\n\n\n\t"))) + (is (= ["Apache" :or "GPL" :with "CE"] (split-on-operators "Apache or GPL with CE")))) + (testing "Strings with multiple nonsensical operators (which get cleaned up elsewhere)" + (is (= ["EPL-2.0" :or "GPL-2.0-or-later" :with "Classpath Exception"] (split-on-operators "EPL-2.0 OR GPL-2.0-or-later WITH Classpath Exception"))) + (is (= ["Apache License 2.0" :with :or "MIT licence"] (split-on-operators "Apache License 2.0 with or MIT licence"))) + (is (= ["Apache Licence 2.0" :or :and :or :and :or :and :or :and "MIT"] (split-on-operators "or and Apache Licence 2.0 or and or and or and or and MIT and or and")))) + (testing "Exception cases for and" + (is (= ["COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0"] (split-on-operators "COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0"))) + (is (= ["COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0"] (split-on-operators "COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0"))) + (is (= ["Common Development & Distribution Licence Version 1.1"] (split-on-operators "Common Development & Distribution Licence Version 1.1"))) + (is (= ["Common Development & Distribution Licence Version 1.1"] (split-on-operators "Common Development & Distribution Licence Version 1.1"))) + (is (= ["Copyright & all rights reserved Lean Pixel"] (split-on-operators "Copyright & all rights reserved Lean Pixel"))) + (is (= ["Copyright and all rights reserved"] (split-on-operators "Copyright and all rights reserved"))) + (is (= ["BSD with attribution and HPND disclaimer"] (split-on-operators "BSD with attribution and HPND disclaimer"))) + (is (= ["HPND with US Government export control warning and acknowledgment"] (split-on-operators "HPND with US Government export control warning and acknowledgment"))) + (is (= ["HPND with US Government export control warning and modification rqmt"] (split-on-operators "HPND with US Government export control warning and modification rqmt"))) + (is (= ["Historical Permission Notice and Disclaimer"] (split-on-operators "Historical Permission Notice and Disclaimer"))) + (is (= ["IBM PowerPC Initialization and Boot Software"] (split-on-operators "IBM PowerPC Initialization and Boot Software"))) + (is (= ["LZMA SDK License (versions 9.22 and beyond)"] (split-on-operators "LZMA SDK License (versions 9.22 and beyond)"))) + (is (= ["Nara Institute of Science and Technology License (2003)"] (split-on-operators "Nara Institute of Science and Technology License (2003)"))) + (is (= ["Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)"] (split-on-operators "Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)"))) + (is (= ["Unicode License Agreement - Data Files and Software"] (split-on-operators "Unicode License Agreement - Data Files and Software"))) + (is (= ["W3C Software Notice and License"] (split-on-operators "W3C Software Notice and License"))) + (is (= ["W3C Software Notice and Document License"] (split-on-operators "W3C Software Notice and Document License"))) + (is (= ["bzip2 and libbzip2 License"] (split-on-operators "bzip2 and libbzip2 License"))) + (is (= ["Creative Commons Attribution Non Commercial Share Alike 2.0 England and Wales"] (split-on-operators "Creative Commons Attribution Non Commercial Share Alike 2.0 England and Wales"))) + (is (= ["Creative Commons Attribution Share Alike 2.0 England and Wales"] (split-on-operators "Creative Commons Attribution Share Alike 2.0 England and Wales"))) + (is (= ["Creative Commons Public Domain Dedication and Certification"] (split-on-operators "Creative Commons Public Domain Dedication and Certification"))) + (is (= ["Academy of Motion Picture Arts and Sciences BSD"] (split-on-operators "Academy of Motion Picture Arts and Sciences BSD"))) + (is (= ["FSF Unlimited License (With License Retention and Warranty Disclaimer)"] (split-on-operators "FSF Unlimited License (With License Retention and Warranty Disclaimer)")))) + (testing "Exception cases for or" + (is (= ["GNU General Public License v3.0 or later"] (split-on-operators "GNU General Public License v3.0 or later"))) + (is (= ["GNU General Public License, Version 3 (or later)"] (split-on-operators "GNU General Public License, Version 3 (or later)"))) + (is (= ["GNU Lesser or Library General Public License, version 2.1"] (split-on-operators "GNU Lesser or Library General Public License, version 2.1"))) + (is (= ["GNU Lesser General Public License, version 2.1 or newer"] (split-on-operators "GNU Lesser General Public License, version 2.1 or newer"))) + (is (= ["GNU Lesser or Library General Public License, version 2.1 or newer"] (split-on-operators "GNU Lesser or Library General Public License, version 2.1 or newer"))) + (is (= ["GNU Library or Lesser General Public License, version 2.1 or newer"] (split-on-operators "GNU Library or Lesser General Public License, version 2.1 or newer"))) + (is (= ["GNU General Public License, v2.0 or greater"] (split-on-operators "GNU General Public License, v2.0 or greater"))) + (is (= ["GNU General Public License, version 3.0 or any later version"] (split-on-operators "GNU General Public License, version 3.0 or any later version"))) + (is (= ["LGPL-3.0-or-later"] (split-on-operators "LGPL-3.0-or-later")))) + (testing "Exception cases for with" + ;####TODO!!!! + ) + (testing "Cursed values" + (is (= ["CDDL" "GPLv2+CE"] (split-on-operators "CDDL/GPLv2+CE")))) + (testing "No splitting of any names in the SPDX license and exception lists" + (let [lic-names (map #(:name (slic/id->info %)) (slic/ids)) + exc-names (map #(:name (sexc/id->info %)) (sexc/ids))] + (is (every? true? (map #(= % (s/join (split-on-operators %))) lic-names))) + (is (every? true? (map #(= % (s/join (split-on-operators %))) exc-names)))))) diff --git a/test/lice_comb/matching_test.clj b/test/lice_comb/matching_test.clj index 11c8c3b..43d4360 100644 --- a/test/lice_comb/matching_test.clj +++ b/test/lice_comb/matching_test.clj @@ -118,12 +118,26 @@ (is (valid= #{"Apache-2.0 AND MIT"} (name->expressions "Apache & MIT licence"))) (is (valid= #{"CDDL-1.1"} (name->expressions "Common Development and Distribution Licence"))) (is (valid= #{"BSD-2-Clause-FreeBSD"} (name->expressions "BSD 2 clause freebsd"))) + (is (valid= #{"BSD-3-Clause-No-Nuclear-License"} (name->expressions "BSD 3 No Nuclear"))) + (is (valid= #{"BSD-3-Clause-No-Nuclear-License-2014"} (name->expressions "BSD 3 No Nuclear 2014"))) (is (valid= #{"GPL-1.0-only"} (name->expressions "GPL only"))) (is (valid= #{"GPL-1.0-or-later"} (name->expressions "GPL or later"))) (is (valid= #{"AGPL-1.0-only"} (name->expressions "AGPL only"))) (is (valid= #{"AGPL-1.0-or-later"} (name->expressions "AGPL or later"))) (is (valid= #{"LGPL-2.0-only"} (name->expressions "Lesser GPL only"))) (is (valid= #{"LGPL-2.0-or-later"} (name->expressions "Lesser GPL or later"))) + (is (valid= #{"LGPL-2.0-or-later"} (name->expressions "Lesser GPL"))) + (is (valid= #{"LGPL-2.0-or-later"} (name->expressions "Library GPL"))) + (is (valid= #{"LGPL-2.0-or-later"} (name->expressions "Lesser GNU"))) + (is (valid= #{"LGPL-2.0-or-later"} (name->expressions "Library GNU"))) + (is (valid= #{"LGPL-2.0-or-later"} (name->expressions "GNU lesser"))) + (is (valid= #{"LGPL-2.0-or-later"} (name->expressions "GNU library"))) + (is (valid= #{"LGPL-2.0-or-later OR MIT"} (name->expressions "MIT or Lesser General Public License"))) + (is (valid= #{"LGPL-2.0-or-later OR MIT"} (name->expressions "MIT or Library General Public License"))) + (is (valid= #{"LGPL-2.0-or-later OR MIT"} (name->expressions "Lesser General Public License or MIT"))) + (is (valid= #{"LGPL-2.0-or-later OR MIT"} (name->expressions "Library General Public License or MIT"))) + (is (valid= #{"LGPL-2.0-or-later OR MIT"} (name->expressions "GNU Lesser or MIT"))) + (is (valid= #{"LGPL-2.0-or-later OR MIT"} (name->expressions "GNU library or MIT"))) (is (valid= #{"BSD-2-Clause-Darwin"} (name->expressions "BSD 2 clause darwin"))) ; Since SPDX license list v3.23 (is (valid= #{"BSD-3-Clause-acpica"} (name->expressions "BSD 3 CLAUSE ACPICA"))) ; Since SPDX license list v3.23 (is (valid= #{(str (lcis/name->unidentified-license-ref "foo") " WITH Classpath-exception-2.0")} (name->expressions "foo with classpath exception"))) diff --git a/test/lice_comb/maven_test.clj b/test/lice_comb/maven_test.clj index 6c0cc4a..dbe7f48 100644 --- a/test/lice_comb/maven_test.clj +++ b/test/lice_comb/maven_test.clj @@ -50,7 +50,7 @@ (testing "Real pom files - remote" (is (valid= #{"Apache-2.0"} (pom->expressions "https://repo1.maven.org/maven2/software/amazon/ion/ion-java/1.0.2/ion-java-1.0.2.pom"))) (is (valid= #{(lcis/public-domain)} (pom->expressions "https://repo1.maven.org/maven2/aopalliance/aopalliance/1.0/aopalliance-1.0.pom"))) ; Note: non-SPDX - (is (valid= #{"EPL-1.0"} (pom->expressions "https://repo.clojars.org/org/clojure/clojure/1.4.0/clojure-1.4.0.pom"))) + (is (valid= #{"EPL-1.0"} (pom->expressions "https://repo1.maven.org/maven2/org/clojure/clojure/1.4.0/clojure-1.4.0.pom"))) (is (valid= #{"Apache-2.0"} (pom->expressions "https://repo.clojars.org/com/github/pmonks/asf-cat/1.0.12/asf-cat-1.0.12.pom"))) (is (valid= #{"Apache-2.0"} (pom->expressions "https://repo.clojars.org/http-kit/http-kit/2.5.3/http-kit-2.5.3.pom"))) (is (nil? (pom->expressions "https://repo.clojars.org/borkdude/sci.impl.reflector/0.0.1/sci.impl.reflector-0.0.1.pom"))) ; This project has no license information in its pom @@ -81,7 +81,7 @@ (is (valid= #{"EPL-2.0"} (gav->expressions "quil" "quil" "4.3.1323"))) ; Clojars (is (valid= #{"EPL-2.0"} (gav->expressions "quil" "quil" "4.3.1426-5368295-SNAPSHOT"))) ; Clojars, SNAPSHOT (is (valid= #{"EPL-2.0"} (gav->expressions "quil" "quil" "4.3.1426-5368295-snapshot"))) ; Clojars, SNAPSHOT lower case - (is (valid= #{"EPL-1.0"} (gav->expressions "org.clojure" "clojure" "1.11.1"))) ; Maven Central + (is (valid= #{"EPL-1.0"} (gav->expressions "org.clojure" "clojure" "1.11.3"))) ; Maven Central (is (valid= #{"EPL-1.0"} (gav->expressions "org.clojure" "clojure" "RELEASE"))) ; Maven Central, RELEASE version (is (valid= #{"EPL-1.0"} (gav->expressions "org.clojure" "clojure" "1.12.0-alpha5"))) ; Maven Central, custom suffix (is (valid= #{"Apache-2.0"} (gav->expressions "org.springframework" "spring-core" "6.1.0"))) ; Maven Central diff --git a/test/lice_comb/test_boilerplate.clj b/test/lice_comb/test_boilerplate.clj index 50f0da4..aee4b10 100644 --- a/test/lice_comb/test_boilerplate.clj +++ b/test/lice_comb/test_boilerplate.clj @@ -31,6 +31,11 @@ (def ^:private global-setup (delay ; Enable spec validation (spec/check-asserts true) + ; Initialise clj-spdx (since this can be slow) + (println "ℹ️ Initialising clj-spdx...") + (sexp/init!) + (println "ℹ️ clj-spdx initialised.") + (flush) nil)) (defn fixture