diff --git a/paper/paper.bib b/paper/paper.bib index 641674d..ff8b13a 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -230,18 +230,63 @@ @article{Zhu:2022 url = {http://jmlr.org/papers/v23/21-1060.html} } -@article{Barroso-Luque:2022, - title = {Cluster Expansions of Multicomponent Ionic Materials: {{Formalism}} and Methodology}, - shorttitle = {Cluster Expansions of Multicomponent Ionic Materials}, - author = {Barroso-Luque, Luis and Zhong, Peichen and Yang, Julia H. and Xie, Fengyu and Chen, Tina and Ouyang, Bin and Ceder, Gerbrand}, - date = {2022-10-12}, - journaltitle = {Physical Review B}, - shortjournal = {Phys. Rev. B}, - volume = {106}, - number = {14}, - pages = {144202}, - publisher = {{American Physical Society}}, - doi = {10.1103/PhysRevB.106.144202}, +@article{Athey:2017, + title = {The {{State}} of {{Applied Econometrics}}: {{Causality}} and {{Policy Evaluation}}}, + author = {Athey, Susan and Imbens, Guido W.}, + year = {2017}, + journal = {Journal of Economic Perspectives}, + volume = {31}, + number = {2}, + pages = {3--32}, + issn = {0895-3309}, + doi = {10.1257/jep.31.2.3}, +} + +@inproceedings{Chen:2021, + title = {Gene {{Selection}} from {{Biological Data}} via {{Group Lasso}} for {{Logistic Regression Model}}: {{Effects}} of {{Different Clustering Algorithms}}}, + author = {Chen, Shunjie and Wang, Pei}, + year = {2021}, + pages = {6374--6379}, + issn = {1934-1768}, + doi = {10.23919/CCC52363.2021.9549471}, +} + +@article{Kim:2012, + title = {Analysis of {{Survival Data}} with {{Group Lasso}}}, + author = {Kim, Jinseog and Sohn, Insuk and Jung, Sin-Ho and Kim, Sujong and Park, Changyi}, + year = {2012}, + journaltitle = {Communications in Statistics - Simulation and Computation}, + volume = {41}, + number = {9}, + pages = {1593--1605}, + publisher = {{Taylor \& Francis}}, + issn = {0361-0918}, + doi = {10.1080/03610918.2011.611311}, +} + +@article{Gu:2018, + title = {Thermochemistry of Gas-Phase and Surface Species via {{LASSO-assisted}} 
Subgraph Selection}, + author = {Gu, Geun Ho and Plechac, Petr and Vlachos, Dionisios G.}, + journaltitle = {Reaction Chemistry \& Engineering}, + year = {2018}, + volume = {3}, + number = {4}, + pages = {454--466}, + publisher = {{The Royal Society of Chemistry}}, + issn = {2058-9883}, + doi = {10.1039/C7RE00210F}, +} + +@article{Ma:2007, + title = {Supervised Group {{Lasso}} with Applications to Microarray Data Analysis}, + author = {Ma, Shuangge and Song, Xiao and Huang, Jian}, + year = {2007}, + journaltitle = {BMC Bioinformatics}, + volume = {8}, + number = {1}, + pages = {60}, + issn = {1471-2105}, + doi = {10.1186/1471-2105-8-60}, } @article{Leong:2019, @@ -256,3 +301,50 @@ @article{Leong:2019 doi = {10.1103/PhysRevB.100.134108}, urldate = {2020-04-29} } + +@article{Xie:2023, + title = {Semigrand-Canonical {{Monte-Carlo}} Simulation Methods for Charge-Decorated Cluster Expansions}, + author = {Xie, Fengyu and Zhong, Peichen and Barroso-Luque, Luis and Ouyang, Bin and Ceder, Gerbrand}, + year = {2023}, + journaltitle = {Computational Materials Science}, + volume = {218}, + pages = {112000}, + issn = {0927-0256}, + doi = {10.1016/j.commatsci.2022.112000}, +} + +@article{Zhong:2022, + title = {An {$\ell_{0}\ell_{2}$}-Norm Regularized Regression Model for Construction of Robust Cluster Expansions in Multicomponent Systems}, + author = {Zhong, Peichen and Chen, Tina and Barroso-Luque, Luis and Xie, Fengyu and Ceder, Gerbrand}, + year = {2022}, + journaltitle = {Physical Review B}, + volume = {106}, + number = {2}, + pages = {024203}, + publisher = {{American Physical Society}}, + doi = {10.1103/PhysRevB.106.024203}, +} + +@article{Zhong:2023, + title = {Modeling {{Intercalation Chemistry}} with {{Multiredox Reactions}} by {{Sparse Lattice Models}} in {{Disordered Rocksalt Cathodes}}}, + author = {Zhong, Peichen and Xie, Fengyu and Barroso-Luque, Luis and Huang, 
Liliang and Ceder, Gerbrand}, + year = {2023}, + journaltitle = {PRX Energy}, + volume = {2}, + number = {4}, + pages = {043005}, + publisher = {{American Physical Society}}, + doi = {10.1103/PRXEnergy.2.043005}, +} + +@article{Barroso-Luque:2022, + title = {Cluster Expansions of Multicomponent Ionic Materials: {{Formalism}} and Methodology}, + author = {Barroso-Luque, Luis and Zhong, Peichen and Yang, Julia H. and Xie, Fengyu and Chen, Tina and Ouyang, Bin and Ceder, Gerbrand}, + year = {2022}, + journaltitle = {Physical Review B}, + volume = {106}, + number = {14}, + pages = {144202}, + publisher = {{American Physical Society}}, + doi = {10.1103/PhysRevB.106.144202}, +} \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md index b91357c..84e7e98 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -33,14 +33,13 @@ resulting in sparse linear models such as the Lasso [@Tibshirani:1996; @Zou:2006 Best Subset Selection [@Hocking:1967] have been widely used in a variety of fields. However, many regression problems involve covariates that have a natural underlying structure, such as group or hierarchical relationships between covariates, that can be -leveraged to obtain improved model performance and interpretability. A common example of -linear regression problems with sparsity structure occurs in chemistry and materials -science when fitting multi-body expansions that involve a hierarchy among the main -effects from chemical composition and higher order corrections -aiming to capture the effects of chemical interactions [@Leong:2019; @Barroso-Luque:2022]. -Several generalizations of the Lasso [@Yuan:2006; @Friedman:2010; @Simon:2013; @Wang:2019] -and Best Subset Selection [@Bertsimas:2016-a; @Bertsimas:2016-b] have been developed to -effectively exploit additional structure in linear regression. 
+leveraged to obtain improved model performance and interpretability; such problems occur +in a wide range of fields including genomics [@Chen:2021], bioinformatics [@Ma:2007], +medicine [@Kim:2012], econometrics [@Athey:2017], chemistry [@Gu:2018], and materials +science [@Leong:2019]. Several generalizations of the Lasso +[@Yuan:2006; @Friedman:2010; @Simon:2013; @Wang:2019] and Best Subset Selection +[@Bertsimas:2016-a; @Bertsimas:2016-b] have been developed to effectively exploit +additional structure in linear regression. # Statement of need @@ -69,7 +68,7 @@ pseudo-norm regularization. The pre-existing packages mentioned include highly performant implementations of the specific models they implement. However, none of these packages implement the full range of sparse linear models available in `sparse-lm`, nor do they support the flexibility -to modify the optimization objective and choose among many open-source and commerically +to modify the optimization objective and choose among many open-source and commercially available solvers. `sparse-lm` satisfies the need for a flexible and comprehensive library that enables easy experimentation and comparisons of different sparse linear regression algorithms within a single package. @@ -138,6 +137,18 @@ introduce hierarchical structure into the model. Finally, we have also included $\ell_2$ regularization term controlled by the hyperparameter $\lambda$, which is useful when dealing with poorly conditioned design matrices. +Statistical regression models with structured sparsity (involving grouped covariates, +sparse grouped covariates, and hierarchical relationships between covariate terms) +parametrized via Group Lasso or Best Subset Selection based objectives have been used in a +wide range of scientific disciplines, including genomics [@Chen:2021], bioinformatics [@Ma:2007], +medicine [@Kim:2012], econometrics [@Athey:2017], chemistry [@Gu:2018], and materials science +[@Leong:2019]. 
The flexible implementation of sparse linear regression models in `sparse-lm` +allows researchers to easily experiment and choose the best regression model for their +specific problem. `sparse-lm` has already been used to build linear models with +structured sparsity in a handful of materials science studies +[@Barroso-Luque:2022; @Zhong:2022; @Xie:2023; @Zhong:2023]. + + # Usage Since the linear regression models in `sparse-lm` are implemented to be compatible with @@ -153,18 +164,20 @@ options are implemented. The implemented models are listed below: The table below shows the regression models that are implemented in `sparse-lm` as well as available implementations in other Python packages. $\checkmark$ indicates that the -| Model | `sparse-lm` | `celer` | `groupyr` | `group-lasso` | `skglm` | `abess` | -|:-----------------------------:|:---------------:|:---------:|:-----------:|:------------------:|:----------:|:--------:| -| (Adaptive) Lasso | $\checkmark$️ | $\checkmark$️ | | | $\checkmark$️ | ️ | -| (Adaptive) Group Lasso | $\checkmark$️ | $\checkmark$️ | $\checkmark$️ | $\checkmark$️ | $\checkmark$ | ️ | -| (Adaptive) Sparse Group Lasso | $\checkmark$️ | | $\checkmark$️ | $\checkmark$️ | $\checkmark$ | ️ | -| (Adaptive) Ridged Group Lasso | $\checkmark$️ | | | | $\checkmark$ | | -| Best Subset Selection | $\checkmark$️ | | | | | ️ | -| Ridged Best Subset Selection | $\checkmark$️ | | | | | ️ | -| $\ell_0$ pseudo-norm | $\checkmark$️ | | | | | ️ | -| $\ell_0\ell_2$ mixed-norm | $\checkmark$️ | | | | | | -Note that only `sparse-lm` includes adaptive versions of Lasso estimators. 
However, some of the third party packages, +| Model | `sparse-lm` | `celer` | `groupyr` | `group-lasso` | `skglm` | `abess` | +|:-----------------------------:|:------------:|:---------:|:-----------:|:-----------:|:------------:|:--------:| +| (Adaptive) Lasso | $\checkmark$️ | $\checkmark$️ | | | $\checkmark$️ | ️ | +| (Adaptive) Group Lasso | $\checkmark$️ | $\checkmark$️ | $\checkmark$️ | $\checkmark$️ | $\checkmark$ | ️ | +| (Adaptive) Sparse Group Lasso | $\checkmark$️ | | $\checkmark$️ | $\checkmark$️ | $\checkmark$ | ️ | +| (Adaptive) Ridged Group Lasso | $\checkmark$️ | | | | $\checkmark$ | | +| Best Subset Selection | $\checkmark$️ | | | | | ️ | +| Ridged Best Subset Selection | $\checkmark$️ | | | | | ️ | +| $\ell_0$ pseudo-norm | $\checkmark$️ | | | | | ️ | +| $\ell_0\ell_2$ mixed-norm | $\checkmark$️ | | | | | | +| $\ell_{1/2}$ pseudo-norm | | | | | $\checkmark$ | | +| $\ell_{2/3}$ pseudo-norm | | | | | $\checkmark$ | | + +Note that only `sparse-lm` includes adaptive versions of Lasso-based estimators. However, some of the third party packages, notably `skglm` and `abess`, include additional penalties and regression objectives that are not implemented in `sparse-lm`. ## Implemented model selection and composition tools