From 1082339e70c5922cb382e60bfa25a5849da82f9b Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tiago.oliveira@sandboxquantum.com>
Date: Fri, 19 Apr 2024 10:24:20 +0100
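Subject: [PATCH 01/19] keccak ref1: remove RC spill; kyber768 ref: spill noiseseed

__keccakf1600_ref1 no longer copies the round-constant pointer RC to a
stack slot (s_RC) and reloads it before each round; RC is indexed
directly.

__indcpa_enc and __iindcpa_enc now save noiseseed in a stack pointer
(noiseseed_s) and reload it before every _poly_getnoise call.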

---
 .../keccak1600/amd64/ref1/keccakf1600.jinc    |  5 +---
 .../kyber/kyber768/amd64/ref/indcpa.jinc      | 28 +++++++++++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
index 85bbfd40..e261b30b 100644
--- a/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
@@ -130,23 +130,20 @@ inline fn __round_ref1(reg ptr u64[25] e a, reg u64 rc) -> reg ptr u64[25]
 inline fn __keccakf1600_ref1(reg ptr u64[25] a) -> reg ptr u64[25]
 {
   reg ptr u64[24] RC;
-  stack ptr u64[24] s_RC;
   stack u64[25] s_e;
   reg ptr u64[25] e;
+
   reg u64 c rc;
 
   RC = KECCAK1600_RC;
-  s_RC = RC;
   e = s_e;
 
   c = 0;
   while (c < KECCAK_ROUNDS - 1)
   {
-    RC = s_RC;
     rc = RC[(int) c];
     e = __round_ref1(e, a, rc);
 
-    RC = s_RC;
     rc = RC[(int) c + 1];
     a = __round_ref1(a, e, rc);
 
diff --git a/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc b/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
index 5e0ac756..34c8982f 100644
--- a/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
+++ b/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
@@ -98,6 +98,9 @@ fn __indcpa_enc(stack u64 sctp, reg ptr u8[32] msgp, reg u64 pkp, reg ptr u8[KYB
   reg u64 ctp;
   reg u16 t;
   reg u8 nonce;
+  stack ptr u8[KYBER_SYMBYTES] noiseseed_s;
+
+  noiseseed_s = noiseseed;
 
   pkpv = __polyvec_frombytes(pkp);
 
@@ -116,20 +119,31 @@ fn __indcpa_enc(stack u64 sctp, reg ptr u8[32] msgp, reg u64 pkp, reg ptr u8[KYB
 
   aat = __gen_matrix(publicseed, 1);
 
+  noiseseed = noiseseed_s;
   nonce = 0;
   sp[0:KYBER_N] = _poly_getnoise(sp[0:KYBER_N], noiseseed, nonce);
+
+  noiseseed = noiseseed_s;
   nonce = 1;
   sp[KYBER_N:KYBER_N] = _poly_getnoise(sp[KYBER_N:KYBER_N], noiseseed, nonce);
+
+  noiseseed = noiseseed_s;
   nonce = 2;
   sp[2*KYBER_N:KYBER_N] = _poly_getnoise(sp[2*KYBER_N:KYBER_N], noiseseed, nonce);
 
+  noiseseed = noiseseed_s;
   nonce = 3;
   ep[0:KYBER_N] = _poly_getnoise(ep[0:KYBER_N], noiseseed, nonce);
+
+  noiseseed = noiseseed_s;
   nonce = 4;
   ep[KYBER_N:KYBER_N] = _poly_getnoise(ep[KYBER_N:KYBER_N], noiseseed, nonce);
+
+  noiseseed = noiseseed_s;
   nonce = 5;
   ep[2*KYBER_N:KYBER_N] = _poly_getnoise(ep[2*KYBER_N:KYBER_N], noiseseed, nonce);
 
+  noiseseed = noiseseed_s;
   nonce = 6;
   epp = _poly_getnoise(epp, noiseseed, nonce);
 
@@ -167,6 +181,9 @@ fn __iindcpa_enc(reg ptr u8[KYBER_CT_LEN] ctp, reg ptr u8[32] msgp, reg u64 pkp,
   reg u16 t;
   reg u8 nonce;
   stack ptr u8[KYBER_CT_LEN] sctp;
+  stack ptr u8[KYBER_SYMBYTES] noiseseed_s;
+
+  noiseseed_s = noiseseed;
 
   sctp = ctp;
 
@@ -187,20 +204,31 @@ fn __iindcpa_enc(reg ptr u8[KYBER_CT_LEN] ctp, reg ptr u8[32] msgp, reg u64 pkp,
 
   aat = __gen_matrix(publicseed, 1);
 
+  noiseseed = noiseseed_s;
   nonce = 0;
   sp[0:KYBER_N] = _poly_getnoise(sp[0:KYBER_N], noiseseed, nonce);
+
+  noiseseed = noiseseed_s;
   nonce = 1;
   sp[KYBER_N:KYBER_N] = _poly_getnoise(sp[KYBER_N:KYBER_N], noiseseed, nonce);
+
+  noiseseed = noiseseed_s;
   nonce = 2;
   sp[2*KYBER_N:KYBER_N] = _poly_getnoise(sp[2*KYBER_N:KYBER_N], noiseseed, nonce);
 
+  noiseseed = noiseseed_s;
   nonce = 3;
   ep[0:KYBER_N] = _poly_getnoise(ep[0:KYBER_N], noiseseed, nonce);
+
+  noiseseed = noiseseed_s;
   nonce = 4;
   ep[KYBER_N:KYBER_N] = _poly_getnoise(ep[KYBER_N:KYBER_N], noiseseed, nonce);
+
+  noiseseed = noiseseed_s;
   nonce = 5;
   ep[2*KYBER_N:KYBER_N] = _poly_getnoise(ep[2*KYBER_N:KYBER_N], noiseseed, nonce);
 
+  noiseseed = noiseseed_s;
   nonce = 6;
   epp = _poly_getnoise(epp, noiseseed, nonce);
 

From 7e65815ece4c5a0dcc993dfdb054d0a72763bb77 Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Mon, 29 Apr 2024 08:53:55 +0100
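Subject: [PATCH 02/19] libjade: dual-license under CC0-1.0 OR Apache-2.0

LICENSE now contains only the SPDX expression. The full licence texts
live in LICENSES/CC0-1.0.txt (the previous LICENSE content) and
LICENSES/Apache-2.0.txt (new).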

---
 LICENSE                 | 122 +-----------------------
 LICENSES/Apache-2.0.txt | 202 ++++++++++++++++++++++++++++++++++++++++
 LICENSES/CC0-1.0.txt    | 121 ++++++++++++++++++++++++
 3 files changed, 324 insertions(+), 121 deletions(-)
 create mode 100644 LICENSES/Apache-2.0.txt
 create mode 100644 LICENSES/CC0-1.0.txt

diff --git a/LICENSE b/LICENSE
index 0e259d42..59ff3e16 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,121 +1 @@
-Creative Commons Legal Code
-
-CC0 1.0 Universal
-
-    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
-    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
-    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
-    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
-    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
-    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
-    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
-    HEREUNDER.
-
-Statement of Purpose
-
-The laws of most jurisdictions throughout the world automatically confer
-exclusive Copyright and Related Rights (defined below) upon the creator
-and subsequent owner(s) (each and all, an "owner") of an original work of
-authorship and/or a database (each, a "Work").
-
-Certain owners wish to permanently relinquish those rights to a Work for
-the purpose of contributing to a commons of creative, cultural and
-scientific works ("Commons") that the public can reliably and without fear
-of later claims of infringement build upon, modify, incorporate in other
-works, reuse and redistribute as freely as possible in any form whatsoever
-and for any purposes, including without limitation commercial purposes.
-These owners may contribute to the Commons to promote the ideal of a free
-culture and the further production of creative, cultural and scientific
-works, or to gain reputation or greater distribution for their Work in
-part through the use and efforts of others.
-
-For these and/or other purposes and motivations, and without any
-expectation of additional consideration or compensation, the person
-associating CC0 with a Work (the "Affirmer"), to the extent that he or she
-is an owner of Copyright and Related Rights in the Work, voluntarily
-elects to apply CC0 to the Work and publicly distribute the Work under its
-terms, with knowledge of his or her Copyright and Related Rights in the
-Work and the meaning and intended legal effect of CC0 on those rights.
-
-1. Copyright and Related Rights. A Work made available under CC0 may be
-protected by copyright and related or neighboring rights ("Copyright and
-Related Rights"). Copyright and Related Rights include, but are not
-limited to, the following:
-
-  i. the right to reproduce, adapt, distribute, perform, display,
-     communicate, and translate a Work;
- ii. moral rights retained by the original author(s) and/or performer(s);
-iii. publicity and privacy rights pertaining to a person's image or
-     likeness depicted in a Work;
- iv. rights protecting against unfair competition in regards to a Work,
-     subject to the limitations in paragraph 4(a), below;
-  v. rights protecting the extraction, dissemination, use and reuse of data
-     in a Work;
- vi. database rights (such as those arising under Directive 96/9/EC of the
-     European Parliament and of the Council of 11 March 1996 on the legal
-     protection of databases, and under any national implementation
-     thereof, including any amended or successor version of such
-     directive); and
-vii. other similar, equivalent or corresponding rights throughout the
-     world based on applicable law or treaty, and any national
-     implementations thereof.
-
-2. Waiver. To the greatest extent permitted by, but not in contravention
-of, applicable law, Affirmer hereby overtly, fully, permanently,
-irrevocably and unconditionally waives, abandons, and surrenders all of
-Affirmer's Copyright and Related Rights and associated claims and causes
-of action, whether now known or unknown (including existing as well as
-future claims and causes of action), in the Work (i) in all territories
-worldwide, (ii) for the maximum duration provided by applicable law or
-treaty (including future time extensions), (iii) in any current or future
-medium and for any number of copies, and (iv) for any purpose whatsoever,
-including without limitation commercial, advertising or promotional
-purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
-member of the public at large and to the detriment of Affirmer's heirs and
-successors, fully intending that such Waiver shall not be subject to
-revocation, rescission, cancellation, termination, or any other legal or
-equitable action to disrupt the quiet enjoyment of the Work by the public
-as contemplated by Affirmer's express Statement of Purpose.
-
-3. Public License Fallback. Should any part of the Waiver for any reason
-be judged legally invalid or ineffective under applicable law, then the
-Waiver shall be preserved to the maximum extent permitted taking into
-account Affirmer's express Statement of Purpose. In addition, to the
-extent the Waiver is so judged Affirmer hereby grants to each affected
-person a royalty-free, non transferable, non sublicensable, non exclusive,
-irrevocable and unconditional license to exercise Affirmer's Copyright and
-Related Rights in the Work (i) in all territories worldwide, (ii) for the
-maximum duration provided by applicable law or treaty (including future
-time extensions), (iii) in any current or future medium and for any number
-of copies, and (iv) for any purpose whatsoever, including without
-limitation commercial, advertising or promotional purposes (the
-"License"). The License shall be deemed effective as of the date CC0 was
-applied by Affirmer to the Work. Should any part of the License for any
-reason be judged legally invalid or ineffective under applicable law, such
-partial invalidity or ineffectiveness shall not invalidate the remainder
-of the License, and in such case Affirmer hereby affirms that he or she
-will not (i) exercise any of his or her remaining Copyright and Related
-Rights in the Work or (ii) assert any associated claims and causes of
-action with respect to the Work, in either case contrary to Affirmer's
-express Statement of Purpose.
-
-4. Limitations and Disclaimers.
-
- a. No trademark or patent rights held by Affirmer are waived, abandoned,
-    surrendered, licensed or otherwise affected by this document.
- b. Affirmer offers the Work as-is and makes no representations or
-    warranties of any kind concerning the Work, express, implied,
-    statutory or otherwise, including without limitation warranties of
-    title, merchantability, fitness for a particular purpose, non
-    infringement, or the absence of latent or other defects, accuracy, or
-    the present or absence of errors, whether or not discoverable, all to
-    the greatest extent permissible under applicable law.
- c. Affirmer disclaims responsibility for clearing rights of other persons
-    that may apply to the Work or any use thereof, including without
-    limitation any person's Copyright and Related Rights in the Work.
-    Further, Affirmer disclaims responsibility for obtaining any necessary
-    consents, permissions or other rights required for any use of the
-    Work.
- d. Affirmer understands and acknowledges that Creative Commons is not a
-    party to this document and has no duty or obligation with respect to
-    this CC0 or use of the Work.
+SPDX-License-Identifier: CC0-1.0 OR Apache-2.0
diff --git a/LICENSES/Apache-2.0.txt b/LICENSES/Apache-2.0.txt
new file mode 100644
index 00000000..d6456956
--- /dev/null
+++ b/LICENSES/Apache-2.0.txt
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/LICENSES/CC0-1.0.txt b/LICENSES/CC0-1.0.txt
new file mode 100644
index 00000000..0e259d42
--- /dev/null
+++ b/LICENSES/CC0-1.0.txt
@@ -0,0 +1,121 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.

From 70eb190a54e657d3c21f85afab0f57daf006ffee Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Tue, 25 Jun 2024 09:56:46 +0100
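Subject: [PATCH 03/19] sct: run SCT checks with jasmin-ct instead of jasminc

Add a JASMIN_CT variable (default: jasmin-ct) and invoke
`$(JASMIN_CT) $(JINCLUDE) ... --sct` in the checksct rules in place of
`$(JASMINC) ... -checkSCT`. SCT_FLAGS now defaults to --infer; a TODO in
Makefile.checksct marks this for removal.

Note: the checker is no longer given $(JFLAGS), because the rules use
$(JINCLUDE) directly rather than $(JASMINC).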

---
 src/Makefile          | 4 +++-
 src/Makefile.checksct | 9 +++++----
 src/Makefile.common   | 3 +++
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 1001829f..80e48e29 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -6,7 +6,9 @@ CC      ?= clang
 CFLAGS  ?= -O3 -Wall -Wextra -Wpedantic -Wvla -Werror -std=c99 \
 	         -Wundef -Wshadow -Wcast-align -Wpointer-arith -Wmissing-prototypes \
 	         -fstrict-aliasing -fno-common -pipe
-JASMIN  ?= jasminc
+
+JASMIN    ?= jasminc
+JASMIN_CT ?= jasmin-ct
 
 # --------------------------------------------------------------------
 CI      ?= 0
diff --git a/src/Makefile.checksct b/src/Makefile.checksct
index 7a1bfc0c..d2a2ce09 100644
--- a/src/Makefile.checksct
+++ b/src/Makefile.checksct
@@ -5,10 +5,11 @@
 
 ifneq ($(OP),)
 
-SCT_FLAGS  ?= 
+# TODO: remove --infer
+SCT_FLAGS  ?= --infer
 
-CHECK_SCT_S = ($(JASMINC) -slice $* -checkSCT $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
-CHECK_SCT   = ($(JASMINC)           -checkSCT $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
+CHECK_SCT_SLICE = ($(JASMIN_CT) $(JINCLUDE) -slice $* --sct $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
+CHECK_SCT       = ($(JASMIN_CT) $(JINCLUDE)           --sct $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
 
 SCT_TARGETS  = $(addsuffix .sct, $(FUNCTIONS))
 
@@ -21,7 +22,7 @@ $(OP).sct : $(OP).jazz $(DEPS_DIR)/$(OP).sct.d | $(DEPS_DIR) $(CI_DIR)
 $(SCT_TARGETS):
 %.sct : $(OP).jazz $(DEPS_DIR)/%.sct.d | $(DEPS_DIR) $(CI_DIR)
 	$(DEPS)
-	$(CHECK_SCT_S)
+	$(CHECK_SCT_SLICE)
 
 DEPFILES := \
  $(DEPFILES) \
diff --git a/src/Makefile.common b/src/Makefile.common
index ab28f62b..0606c32d 100644
--- a/src/Makefile.common
+++ b/src/Makefile.common
@@ -35,10 +35,13 @@ endif
 JEXT    ?= jazz
 override JFLAGS += -noinsertarraycopy
 JINCLUDE = -I Jade:$(SRC)
+
 JASMIN  ?= jasminc
 JASMINC := $(JASMIN) $(JFLAGS) $(JINCLUDE)
 COMPILE  = ($(JASMINC) -o $@ $<) $(CIT)
 
+JASMIN_CT ?= jasmin-ct
+
 # --------------------------------------------------------------------
 include $(SRC)/$(OPERATION)/EcFlags.mk
 

From 3a6c9b41a3248a2c6f2c1a6131c34de0d70b793b Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Wed, 26 Jun 2024 07:05:27 +0100
Subject: [PATCH 04/19] sct: towards jasmin-ct (2)

---
 src/Makefile          |  7 +++----
 src/Makefile.checksct | 24 +++++++++++++++++++-----
 src/Makefile.common   |  2 --
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 80e48e29..27d7314d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -24,7 +24,7 @@ SRC     := .
 FILTER  ?= $(SRC)/crypto_%
 JAZZ    ?=  $(filter $(FILTER), $(filter-out $(addprefix ./,$(EXCLUDE)), $(sort $(dir $(shell find $(SRC) -name '*.jazz')))))
 SAFETY  ?= $(addsuffix safety, $(JAZZ))
-SCT     ?= $(addsuffix sct, $(JAZZ))
+SCT     ?= $(addsuffix check_sct, $(JAZZ))
 
 SOURCES ?= $(filter-out ./, $(sort $(dir $(shell find $(SRC) -name 'Makefile'))))
 ASM     := $(shell find $(SRC) -name '*.s')
@@ -74,9 +74,8 @@ $(SAFETY):
 	$(MAKE) -C $(@D) $(@F) || true
 
 # --------------------------------------------------------------------
-
-.PHONY: sct
-sct: $(SCT)
+.PHONY: check_sct
+check_sct: $(SCT)
 
 $(SCT):
 	$(MAKE) -C $(@D) $(@F) || true
diff --git a/src/Makefile.checksct b/src/Makefile.checksct
index d2a2ce09..2168ccfb 100644
--- a/src/Makefile.checksct
+++ b/src/Makefile.checksct
@@ -3,27 +3,41 @@
 #   functions
 # - it is meant to be included by Makefile.common
 
+# JASMIN_CT belongs here (and not Makefile.common): some options differ from jasminc
+JASMIN_CT ?= jasmin-ct
+
 ifneq ($(OP),)
 
-# TODO: remove --infer
+# TODO: remove --infer and annotate exported functions
 SCT_FLAGS  ?= --infer
 
-CHECK_SCT_SLICE = ($(JASMIN_CT) $(JINCLUDE) -slice $* --sct $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
-CHECK_SCT       = ($(JASMIN_CT) $(JINCLUDE)           --sct $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
+CHECK_SCT_SLICE        = (JASMINPATH="Jade=$(SRC)" $(JASMIN_CT) --slice $* --sct $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
+CHECK_SCT_SLICE_STDOUT = (JASMINPATH="Jade=$(SRC)" $(JASMIN_CT) --slice $* --sct $(SCT_FLAGS) $<          ) $(CIT)
+
+CHECK_SCT              = (JASMINPATH="Jade=$(SRC)" $(JASMIN_CT)            --sct $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
+CHECK_SCT_STDOUT       = (JASMINPATH="Jade=$(SRC)" $(JASMIN_CT)            --sct $(SCT_FLAGS) $<          ) $(CIT)
 
-SCT_TARGETS  = $(addsuffix .sct, $(FUNCTIONS))
+SCT_TARGETS         = $(addsuffix .sct, $(FUNCTIONS))
+SCT_TARGETS_STDOUT  = $(addsuffix .stdout, $(SCT_TARGETS))
 
-sct: $(SCT_TARGETS)
+check_sct: $(SCT_TARGETS)
 
 $(OP).sct : $(OP).jazz $(DEPS_DIR)/$(OP).sct.d | $(DEPS_DIR) $(CI_DIR)
 	$(DEPS)
 	$(CHECK_SCT)
 
+$(OP).sct.stdout : $(OP).jazz | $(CI_DIR)
+	$(CHECK_SCT_STDOUT)
+
 $(SCT_TARGETS):
 %.sct : $(OP).jazz $(DEPS_DIR)/%.sct.d | $(DEPS_DIR) $(CI_DIR)
 	$(DEPS)
 	$(CHECK_SCT_SLICE)
 
+$(SCT_TARGETS_STDOUT):
+%.sct.stdout : $(OP).jazz | $(CI_DIR)
+	$(CHECK_SCT_SLICE_STDOUT)
+
 DEPFILES := \
  $(DEPFILES) \
  $(addprefix $(DEPS_DIR)/, $(addsuffix .sct.d, $(FUNCTIONS) $(OP)))
diff --git a/src/Makefile.common b/src/Makefile.common
index 0606c32d..9b1b272f 100644
--- a/src/Makefile.common
+++ b/src/Makefile.common
@@ -40,8 +40,6 @@ JASMIN  ?= jasminc
 JASMINC := $(JASMIN) $(JFLAGS) $(JINCLUDE)
 COMPILE  = ($(JASMINC) -o $@ $<) $(CIT)
 
-JASMIN_CT ?= jasmin-ct
-
 # --------------------------------------------------------------------
 include $(SRC)/$(OPERATION)/EcFlags.mk
 

From cf0920b16f51009e9a198b3874de0a50447890a3 Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Wed, 26 Jun 2024 07:43:26 +0100
Subject: [PATCH 05/19] sct: crypto_hash/sha256/amd64/ref

---
 src/crypto_hash/sha256/amd64/ref/hash.jazz   | 3 +++
 src/crypto_hash/sha256/amd64/ref/sha256.jinc | 8 ++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/crypto_hash/sha256/amd64/ref/hash.jazz b/src/crypto_hash/sha256/amd64/ref/hash.jazz
index bed68245..2a04a350 100644
--- a/src/crypto_hash/sha256/amd64/ref/hash.jazz
+++ b/src/crypto_hash/sha256/amd64/ref/hash.jazz
@@ -3,6 +3,9 @@ require "sha256.jinc"
 export fn jade_hash_sha256_amd64_ref(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha256_ref(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha256/amd64/ref/sha256.jinc b/src/crypto_hash/sha256/amd64/ref/sha256.jinc
index fa7497e4..c5a83de4 100644
--- a/src/crypto_hash/sha256/amd64/ref/sha256.jinc
+++ b/src/crypto_hash/sha256/amd64/ref/sha256.jinc
@@ -192,7 +192,7 @@ fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64
   stack ptr u32[8] Hp;
   reg ptr u32[8] H;
   reg u64 tr;
-  stack u64 in_s;
+  #mmx reg u64 in_s;
 
   Kp = SHA256_K;
   Hp = _H;
@@ -275,9 +275,9 @@ fn _blocks_1_ref(reg ptr u32[8] _H, reg ptr u32[32] sblocks, reg u64 nblocks) ->
   reg ptr u32[64] Kp;
   stack ptr u32[8] Hp;
   reg ptr u32[8] H;
-  stack ptr u32[32] s_sblocks;
+  #mmx reg ptr u32[32] s_sblocks;
   reg u64 i oblocks tr;
-  stack u64 s_i;
+  #mmx reg u64 s_i;
 
   Kp = SHA256_K;
   Hp = _H;
@@ -395,7 +395,7 @@ inline fn __lastblocks_ref(reg u64 in inlen bits) -> stack u32[32], reg u64
 inline fn __sha256_ref(reg u64 out in inlen)
 {
   reg u64 bits nblocks;
-  stack u64 s_out s_bits;
+  #mmx reg u64 s_out s_bits;
   stack u32[8] H;
   reg ptr u32[8] Hp;
   stack u32[32] sblocks;

From 36658f181bbfc51d08c2d30ca73af96b743eda62 Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Wed, 26 Jun 2024 07:45:17 +0100
Subject: [PATCH 06/19] sct: crypto_hash/sha512/amd64/ref

---
 src/crypto_hash/sha512/amd64/ref/hash.jazz   | 3 +++
 src/crypto_hash/sha512/amd64/ref/sha512.jinc | 8 ++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/crypto_hash/sha512/amd64/ref/hash.jazz b/src/crypto_hash/sha512/amd64/ref/hash.jazz
index 76212246..9990d2dd 100644
--- a/src/crypto_hash/sha512/amd64/ref/hash.jazz
+++ b/src/crypto_hash/sha512/amd64/ref/hash.jazz
@@ -3,6 +3,9 @@ require "sha512.jinc"
 export fn jade_hash_sha512_amd64_ref(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha512_ref(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha512/amd64/ref/sha512.jinc b/src/crypto_hash/sha512/amd64/ref/sha512.jinc
index 184af39b..15c49814 100644
--- a/src/crypto_hash/sha512/amd64/ref/sha512.jinc
+++ b/src/crypto_hash/sha512/amd64/ref/sha512.jinc
@@ -192,7 +192,7 @@ fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64
   stack ptr u64[8] Hp;
   reg ptr u64[8] H;
   reg u64 tr;
-  stack u64 in_s;
+  #mmx reg u64 in_s;
 
   Kp = SHA512_K;
   Hp = _H;
@@ -275,9 +275,9 @@ fn _blocks_1_ref(reg ptr u64[8] _H, reg ptr u64[32] sblocks, reg u64 nblocks) ->
   reg ptr u64[80] Kp;
   stack ptr u64[8] Hp;
   reg ptr u64[8] H;
-  stack ptr u64[32] s_sblocks;
+  #mmx reg ptr u64[32] s_sblocks;
   reg u64 i oblocks tr;
-  stack u64 s_i;
+  #mmx reg u64 s_i;
 
   Kp = SHA512_K;
   Hp = _H;
@@ -395,7 +395,7 @@ inline fn __lastblocks_ref(reg u64 in inlen bits) -> stack u64[32], reg u64
 inline fn __sha512_ref(reg u64 out in inlen)
 {
   reg u64 bits nblocks;
-  stack u64 s_out s_bits;
+  #mmx reg u64 s_out s_bits;
   stack u64[8] H;
   reg ptr u64[8] Hp;
   stack u64[32] sblocks;

From 9268d1ca24059bf0d41fb69401aa728d7bb80165 Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Wed, 26 Jun 2024 08:15:47 +0100
Subject: [PATCH 07/19] sct: crypto_hash sha3-* ref1

---
 .../keccak/keccak1600/amd64/ref1/keccak1600.jinc     | 12 ++++++------
 src/crypto_hash/sha3-224/amd64/ref1/hash.jazz        |  3 +++
 src/crypto_hash/sha3-256/amd64/ref1/hash.jazz        |  3 +++
 src/crypto_hash/sha3-384/amd64/ref1/hash.jazz        |  3 +++
 src/crypto_hash/sha3-512/amd64/ref1/hash.jazz        |  3 +++
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/common/keccak/keccak1600/amd64/ref1/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/ref1/keccak1600.jinc
index c6dcf710..6d18b83e 100644
--- a/src/common/keccak/keccak1600/amd64/ref1/keccak1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref1/keccak1600.jinc
@@ -87,7 +87,7 @@ inline fn __absorb_ref1(
   reg   u64 rate // rate already in bytes -- it is returned bc of spills
 ) -> reg ptr u64[25], reg u64
 {
-  stack u64 s_in s_inlen s_rate;
+  #mmx reg u64 s_in s_inlen s_rate;
   reg u8 trail_byte;
 
   // intermediate blocks
@@ -173,13 +173,13 @@ inline fn __xtr_bytes_ref1(
 
 inline fn __squeeze_ref1(
   reg mut ptr u64[25] state,
-  stack   u64 s_out,
-  reg     u64 outlen,
-  reg     u64 rate
+  #mmx reg u64 s_out,
+  reg      u64 outlen,
+  reg      u64 rate
 )
 {
   reg u64 out;
-  stack u64 s_outlen s_rate;
+  #mmx reg u64 s_outlen s_rate;
 
   // intermediate blocks
   while ( outlen > rate )
@@ -212,7 +212,7 @@ inline fn __keccak1600_ref1(reg u64 out outlen in inlen, reg u8 trail_byte, reg
 {
   stack u64[25] _state;
   reg ptr u64[25] state;
-  stack u64 s_out s_outlen;
+  #mmx reg u64 s_out s_outlen;
   stack u8 s_trail_byte;
 
   s_out = out;
diff --git a/src/crypto_hash/sha3-224/amd64/ref1/hash.jazz b/src/crypto_hash/sha3-224/amd64/ref1/hash.jazz
index df9387c2..6411abba 100644
--- a/src/crypto_hash/sha3-224/amd64/ref1/hash.jazz
+++ b/src/crypto_hash/sha3-224/amd64/ref1/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-224.jinc"
 export fn jade_hash_sha3_224_amd64_ref1(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_224_ref1(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-256/amd64/ref1/hash.jazz b/src/crypto_hash/sha3-256/amd64/ref1/hash.jazz
index e8a10bf8..0538261b 100644
--- a/src/crypto_hash/sha3-256/amd64/ref1/hash.jazz
+++ b/src/crypto_hash/sha3-256/amd64/ref1/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-256.jinc"
 export fn jade_hash_sha3_256_amd64_ref1(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_256_ref1(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-384/amd64/ref1/hash.jazz b/src/crypto_hash/sha3-384/amd64/ref1/hash.jazz
index 166e9a76..53a0e0cd 100644
--- a/src/crypto_hash/sha3-384/amd64/ref1/hash.jazz
+++ b/src/crypto_hash/sha3-384/amd64/ref1/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-384.jinc"
 export fn jade_hash_sha3_384_amd64_ref1(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_384_ref1(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-512/amd64/ref1/hash.jazz b/src/crypto_hash/sha3-512/amd64/ref1/hash.jazz
index 453a96a8..26126522 100644
--- a/src/crypto_hash/sha3-512/amd64/ref1/hash.jazz
+++ b/src/crypto_hash/sha3-512/amd64/ref1/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-512.jinc"
 export fn jade_hash_sha3_512_amd64_ref1(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_512_ref1(hash, input, input_length);
   ?{}, r = #set0();
   return r;

From b639b734557358cc5e5e81388c9b608a6c7ad926 Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Wed, 26 Jun 2024 08:20:38 +0100
Subject: [PATCH 08/19] sct: crypto_hash sha3-* avx2

---
 .../keccak1600/amd64/avx2/keccak1600.jinc     | 119 +++++++++++++-----
 .../keccak1600/amd64/avx2/keccakf1600.jinc    |   8 +-
 src/crypto_hash/sha3-224/amd64/avx2/hash.jazz |   7 +-
 .../sha3-224/amd64/avx2/sha3-224.jinc         |   4 +-
 src/crypto_hash/sha3-256/amd64/avx2/hash.jazz |   7 +-
 .../sha3-256/amd64/avx2/sha3-256.jinc         |   4 +-
 src/crypto_hash/sha3-384/amd64/avx2/hash.jazz |   7 +-
 .../sha3-384/amd64/avx2/sha3-384.jinc         |   4 +-
 src/crypto_hash/sha3-512/amd64/avx2/hash.jazz |   7 +-
 .../sha3-512/amd64/avx2/sha3-512.jinc         |   4 +-
 10 files changed, 122 insertions(+), 49 deletions(-)

diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccak1600.jinc
index 4403e5bb..3cbacb8a 100644
--- a/src/common/keccak/keccak1600/amd64/avx2/keccak1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/avx2/keccak1600.jinc
@@ -33,23 +33,32 @@ inline fn __add_full_block_avx2(
   stack u64[28] s_state,
   reg ptr u64[25] a_jagged_p,
   reg u64 in inlen,
-  reg u64 rate
-) -> reg u256[7], stack u64[28], reg u64, reg u64
+  reg u64 rate,
+  #msf reg u64 ms
+) -> reg u256[7], stack u64[28], reg u64, reg u64, #msf reg u64
 {
 
   inline int i;
   reg u64 j l t rate8;
+  reg bool loop_condition;
 
   rate8 = rate;
   rate8 >>= 3;
   j = 0;
-  while ( j < rate8 )
+  while { loop_condition = ( j < rate8 ); } ( loop_condition )
   {
+    ms = #update_msf(loop_condition, ms);
+
     t = [in + 8*j];
+
     l = a_jagged_p[(int) j];
+    l = #protect(l, ms);
+
     s_state[(int) l] = t;
     j += 1;
+
   }
+  ms = #update_msf(!loop_condition, ms);
 
   //TODO: check & change to #VPBROADCAST_4u64
   t = s_state[0];
@@ -63,7 +72,7 @@ inline fn __add_full_block_avx2(
   in += rate;
   inlen -= rate;
 
-  return state, s_state, in, inlen;
+  return state, s_state, in, inlen, ms;
 }
 
 
@@ -74,42 +83,56 @@ inline fn __add_final_block_avx2(
   reg ptr u64[25] a_jagged_p,
   reg   u64 in inlen,
   reg   u8  trail_byte,
-  reg   u64 rate
-) -> reg u256[7]
+  reg   u64 rate,
+  #msf reg u64 ms
+) -> reg u256[7], #msf reg u64
 {
   inline int i;
   reg u64 j l t inlen8;
   reg u8 c;
+  reg bool loop_condition;
 
   s_state = __init_s_state_avx2();
 
   inlen8 = inlen;
   inlen8 >>= 3;
   j = 0;
-  while ( j < inlen8 )
+
+  while { loop_condition = (j < inlen8); } ( loop_condition )
   {
+    ms = #update_msf(loop_condition, ms);
+
     t = [in + 8*j];
     l = a_jagged_p[(int) j];
+    l = #protect(l, ms);    
+
     s_state[(int) l] = t;
     j += 1;
   }
+  ms = #update_msf(!loop_condition, ms);
+
   l = a_jagged_p[(int) j];
+  l = #protect(l, ms);
+
   l <<= 3;
   j <<= 3;
 
-  while ( j < inlen )
+  while { loop_condition = ( j < inlen ); } ( loop_condition )
   {
+    ms = #update_msf(loop_condition, ms);
     c = (u8)[in + j];
     s_state[u8 (int) l] = c;
     j += 1;
     l += 1;
   }
+  ms = #update_msf(!loop_condition, ms);
 
   s_state[u8 (int) l] = trail_byte;
 
   // j  = (rate-1) >> 3;
   j = rate; j -= 1; j >>= 3;
   l  = a_jagged_p[(int) j];
+  l = #protect(l, ms);
   l <<= 3;
   // l += ((rate-1) & 0x7)
   j = rate; j -= 1; j &= 0x7;
@@ -125,7 +148,7 @@ inline fn __add_final_block_avx2(
   for i = 0 to 7
   { state[i] ^= s_state[u256 i]; }
 
-  return state;
+  return state, ms;
 }
 
 
@@ -134,12 +157,14 @@ inline fn __xtr_full_block_avx2(
   reg u256[7] state,
   reg ptr u64[25] a_jagged_p,
   reg u64 out,
-  reg u64 len
-) -> reg u64
+  reg u64 len,
+  #msf reg u64 ms
+) -> reg u64, #msf reg u64
 {
   inline int i;
   stack u64[28] s_state;
   reg u64 j l t len8;
+  reg bool loop_condition;
 
   for i = 0 to 7
   { s_state[u256 i] = state[i]; }
@@ -147,17 +172,22 @@ inline fn __xtr_full_block_avx2(
   len8 = len;
   len8 >>= 3;
   j = 0;
-  while ( j < len8 )
+  while { loop_condition = ( j < len8 ); } ( loop_condition )
   {
+    ms = #update_msf(loop_condition, ms);
+
     l = a_jagged_p[(int) j];
+    l = #protect(l, ms);    
+
     t = s_state[(int) l];
     [out + 8*j] = t;
     j += 1;
   }
+  ms = #update_msf(!loop_condition, ms);
 
   out += len;
 
-  return out;
+  return out, ms;
 }
 
 
@@ -166,13 +196,15 @@ inline fn __xtr_bytes_avx2(
   reg u256[7] state,
   reg ptr u64[25] a_jagged_p,
   reg u64 out,
-  reg u64 len
+  reg u64 len,
+  #msf reg u64 ms
 ) -> reg u64
 {
   inline int i;
   stack u64[28] s_state;
   reg u64 j l t len8;
   reg u8 c;
+  reg bool loop_condition;
 
   for i = 0 to 7
   { s_state[u256 i] = state[i]; }
@@ -180,13 +212,22 @@ inline fn __xtr_bytes_avx2(
   len8 = len;
   len8 >>= 3;
   j = 0;
-  while ( j < len8 )
-  { l = a_jagged_p[(int) j];
+  while { loop_condition = ( j < len8 ); } ( loop_condition )
+  { 
+    ms = #update_msf(loop_condition, ms);
+
+    l = a_jagged_p[(int) j];
+    l = #protect(l, ms);
+
     t = s_state[(int) l];
     [out + 8*j] = t;
     j += 1;
   }
+  ms = #update_msf(!loop_condition, ms);
+
   l = a_jagged_p[(int)j];
+  l = #protect(l, ms);
+
   j <<= 3;
   l <<= 3;
 
@@ -208,65 +249,75 @@ inline fn __absorb_avx2(
   reg u256[7] state,
   reg u64 in inlen,
   reg u8  trail_byte,
-  reg u64 rate
-) -> reg u256[7]
+  reg u64 rate,
+  #msf reg u64 ms
+) -> reg u256[7], #msf reg u64
 {
   stack u64[28] s_state;
   reg ptr u64[25] a_jagged_p;
+  reg bool loop_condition;
 
   a_jagged_p = KECCAK_A_JAGGED;
   s_state = __init_s_state_avx2();
 
   // intermediate blocks
-  while ( inlen >= rate )
+  while { loop_condition = (inlen >= rate); } (loop_condition)
   {
-    state, s_state, in, inlen = __add_full_block_avx2(state, s_state, a_jagged_p, in, inlen, rate);
-    state = __keccakf1600_avx2(state);
+    ms = #update_msf(loop_condition, ms);
+
+    state, s_state, in, inlen, ms = __add_full_block_avx2(state, s_state, a_jagged_p, in, inlen, rate, ms);
+
+    state, ms = __keccakf1600_avx2(state, ms);
   }
+  ms = #update_msf(!loop_condition, ms);
 
   // final block
-  state = __add_final_block_avx2(state, s_state, a_jagged_p, in, inlen, trail_byte, rate);
+  state, ms = __add_final_block_avx2(state, s_state, a_jagged_p, in, inlen, trail_byte, rate, ms);
 
-  return state;
+  return state, ms;
 }
 
 
-inline fn __squeeze_avx2(reg u256[7] state, reg u64 out outlen rate)
+inline fn __squeeze_avx2(reg u256[7] state, reg u64 out outlen rate, #msf reg u64 ms)
 {
   reg ptr u64[25] a_jagged_p;
+  reg bool loop_condition;
 
   a_jagged_p = KECCAK_A_JAGGED;
 
   // intermediate blocks
-  while ( outlen > rate )
+  while { loop_condition = (outlen > rate); } ( loop_condition )
   {
-    state = __keccakf1600_avx2(state);
-    out = __xtr_full_block_avx2(state, a_jagged_p, out, rate);
+    ms = #update_msf(loop_condition, ms);
+
+    state, ms = __keccakf1600_avx2(state, ms);
+    out, ms = __xtr_full_block_avx2(state, a_jagged_p, out, rate, ms);
     outlen -= rate;
   }
+  ms = #update_msf(!loop_condition, ms);
 
-  state = __keccakf1600_avx2(state);
-  out = __xtr_bytes_avx2(state, a_jagged_p, out, outlen);
+  state, ms = __keccakf1600_avx2(state, ms);
+  out = __xtr_bytes_avx2(state, a_jagged_p, out, outlen, ms);
 }
 
 
-inline fn __keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate)
+inline fn __keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate, #msf reg u64 ms)
 {
   reg u256[7] state;
 
   state = __keccak_init_avx2();
 
   // absorb
-  state = __absorb_avx2(state, in, inlen, trail_byte, rate);
+  state, ms = __absorb_avx2(state, in, inlen, trail_byte, rate, ms);
 
   // squeeze
-  __squeeze_avx2(state, out, outlen, rate);
+  __squeeze_avx2(state, out, outlen, rate, ms);
 }
 
 
-fn _keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate)
+fn _keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate, #msf reg u64 ms)
 {
-  __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate);
+  __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms);
 }
 
 
diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc
index 6ca9dda6..907981ee 100644
--- a/src/common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc
@@ -59,7 +59,7 @@ u64[25] KECCAK_A_JAGGED =
 };
 
 
-inline fn __keccakf1600_avx2(reg u256[7] state) -> reg u256[7]
+inline fn __keccakf1600_avx2(reg u256[7] state, #msf reg u64 ms) -> reg u256[7], #msf reg u64
 {
   reg u256[9] t;
   reg u256 c00 c14 d00 d14;
@@ -194,9 +194,11 @@ inline fn __keccakf1600_avx2(reg u256[7] state) -> reg u256[7]
     iotas_o += 32;
 
     _,_,_,zf,r = #DEC_64(r);
-  }(!zf)
+  }(!zf) { ms = #update_msf(!zf, ms); }
 
-  return state;
+  ms = #update_msf(zf, ms);
+
+  return state, ms;
 }
 
 
diff --git a/src/crypto_hash/sha3-224/amd64/avx2/hash.jazz b/src/crypto_hash/sha3-224/amd64/avx2/hash.jazz
index 77ae780a..97d4822a 100644
--- a/src/crypto_hash/sha3-224/amd64/avx2/hash.jazz
+++ b/src/crypto_hash/sha3-224/amd64/avx2/hash.jazz
@@ -3,7 +3,12 @@ require "sha3-224.jinc"
 export fn jade_hash_sha3_224_amd64_avx2(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
-  __sha3_224_avx2(hash, input, input_length);
+  #msf reg u64 ms;
+
+  ms = #init_msf();
+
+  __sha3_224_avx2(hash, input, input_length, ms);
+
   ?{}, r = #set0();
   return r;
 }
diff --git a/src/crypto_hash/sha3-224/amd64/avx2/sha3-224.jinc b/src/crypto_hash/sha3-224/amd64/avx2/sha3-224.jinc
index 10f0d31b..42e20d46 100644
--- a/src/crypto_hash/sha3-224/amd64/avx2/sha3-224.jinc
+++ b/src/crypto_hash/sha3-224/amd64/avx2/sha3-224.jinc
@@ -1,6 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc"
 
-inline fn __sha3_224_avx2(reg u64 out in inlen)
+inline fn __sha3_224_avx2(reg u64 out in inlen, #msf reg u64 ms)
 {
   reg u64 outlen rate;
   reg u8 trail_byte;
@@ -9,7 +9,7 @@ inline fn __sha3_224_avx2(reg u64 out in inlen)
   trail_byte = 0x6;
   rate = (1152/8);
 
-  _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate);
+  _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms);
 }
 
 
diff --git a/src/crypto_hash/sha3-256/amd64/avx2/hash.jazz b/src/crypto_hash/sha3-256/amd64/avx2/hash.jazz
index 462c1c0b..88f6b8ff 100644
--- a/src/crypto_hash/sha3-256/amd64/avx2/hash.jazz
+++ b/src/crypto_hash/sha3-256/amd64/avx2/hash.jazz
@@ -3,7 +3,12 @@ require "sha3-256.jinc"
 export fn jade_hash_sha3_256_amd64_avx2(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
-  __sha3_256_avx2(hash, input, input_length);
+  #msf reg u64 ms;
+
+  ms = #init_msf();
+
+  __sha3_256_avx2(hash, input, input_length, ms);
+
   ?{}, r = #set0();
   return r;
 }
diff --git a/src/crypto_hash/sha3-256/amd64/avx2/sha3-256.jinc b/src/crypto_hash/sha3-256/amd64/avx2/sha3-256.jinc
index ee575bb5..6a808935 100644
--- a/src/crypto_hash/sha3-256/amd64/avx2/sha3-256.jinc
+++ b/src/crypto_hash/sha3-256/amd64/avx2/sha3-256.jinc
@@ -1,6 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc"
 
-inline fn __sha3_256_avx2(reg u64 out in inlen)
+inline fn __sha3_256_avx2(reg u64 out in inlen, #msf reg u64 ms)
 {
   reg u64 outlen rate;
   reg u8 trail_byte;
@@ -9,7 +9,7 @@ inline fn __sha3_256_avx2(reg u64 out in inlen)
   trail_byte = 0x6;
   rate = (1088/8);
 
-  _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate);
+  _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms);
 }
 
 
diff --git a/src/crypto_hash/sha3-384/amd64/avx2/hash.jazz b/src/crypto_hash/sha3-384/amd64/avx2/hash.jazz
index 0be82db3..75e61f6c 100644
--- a/src/crypto_hash/sha3-384/amd64/avx2/hash.jazz
+++ b/src/crypto_hash/sha3-384/amd64/avx2/hash.jazz
@@ -3,7 +3,12 @@ require "sha3-384.jinc"
 export fn jade_hash_sha3_384_amd64_avx2(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
-  __sha3_384_avx2(hash, input, input_length);
+  #msf reg u64 ms;
+
+  ms = #init_msf();
+
+  __sha3_384_avx2(hash, input, input_length, ms);
+
   ?{}, r = #set0();
   return r;
 }
diff --git a/src/crypto_hash/sha3-384/amd64/avx2/sha3-384.jinc b/src/crypto_hash/sha3-384/amd64/avx2/sha3-384.jinc
index db29845f..4737c251 100644
--- a/src/crypto_hash/sha3-384/amd64/avx2/sha3-384.jinc
+++ b/src/crypto_hash/sha3-384/amd64/avx2/sha3-384.jinc
@@ -1,6 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc"
 
-inline fn __sha3_384_avx2(reg u64 out in inlen)
+inline fn __sha3_384_avx2(reg u64 out in inlen, #msf reg u64 ms)
 {
   reg u64 outlen rate;
   reg u8 trail_byte;
@@ -9,7 +9,7 @@ inline fn __sha3_384_avx2(reg u64 out in inlen)
   trail_byte = 0x6;
   rate = (832/8);
 
-  _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate);
+  _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms);
 }
 
 
diff --git a/src/crypto_hash/sha3-512/amd64/avx2/hash.jazz b/src/crypto_hash/sha3-512/amd64/avx2/hash.jazz
index 49335d0d..50070315 100644
--- a/src/crypto_hash/sha3-512/amd64/avx2/hash.jazz
+++ b/src/crypto_hash/sha3-512/amd64/avx2/hash.jazz
@@ -3,7 +3,12 @@ require "sha3-512.jinc"
 export fn jade_hash_sha3_512_amd64_avx2(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
-  __sha3_512_avx2(hash, input, input_length);
+  #msf reg u64 ms;
+
+  ms = #init_msf();
+
+  __sha3_512_avx2(hash, input, input_length, ms);
+
   ?{}, r = #set0();
   return r;
 }
diff --git a/src/crypto_hash/sha3-512/amd64/avx2/sha3-512.jinc b/src/crypto_hash/sha3-512/amd64/avx2/sha3-512.jinc
index 17ce4c24..0a9da967 100644
--- a/src/crypto_hash/sha3-512/amd64/avx2/sha3-512.jinc
+++ b/src/crypto_hash/sha3-512/amd64/avx2/sha3-512.jinc
@@ -1,6 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc"
 
-inline fn __sha3_512_avx2(reg u64 out in inlen)
+inline fn __sha3_512_avx2(reg u64 out in inlen, #msf reg u64 ms)
 {
   reg u64 outlen rate;
   reg u8 trail_byte;
@@ -9,7 +9,7 @@ inline fn __sha3_512_avx2(reg u64 out in inlen)
   trail_byte = 0x6;
   rate = (576/8);
 
-  _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate);
+  _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms);
 }
 
 

From 43fd681a17c3bd28c40f5a24bdd78a47d025e241 Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Wed, 26 Jun 2024 08:34:56 +0100
Subject: [PATCH 09/19] sct: crypto_hash sha3-* ref and bmi1

---
 .../keccak/keccak1600/amd64/bmi1/keccak1600.jinc     | 12 ++++++------
 .../keccak/keccak1600/amd64/bmi1/keccakf1600.jinc    |  2 +-
 .../keccak/keccak1600/amd64/ref/keccak1600.jinc      |  8 ++++----
 src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz        |  3 +++
 src/crypto_hash/sha3-224/amd64/ref/hash.jazz         |  3 +++
 src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz        |  3 +++
 src/crypto_hash/sha3-256/amd64/ref/hash.jazz         |  3 +++
 src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz        |  3 +++
 src/crypto_hash/sha3-384/amd64/ref/hash.jazz         |  3 +++
 src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz        |  3 +++
 src/crypto_hash/sha3-512/amd64/ref/hash.jazz         |  3 +++
 11 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/src/common/keccak/keccak1600/amd64/bmi1/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/bmi1/keccak1600.jinc
index fa81ca75..9e427d46 100644
--- a/src/common/keccak/keccak1600/amd64/bmi1/keccak1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/bmi1/keccak1600.jinc
@@ -88,7 +88,7 @@ inline fn __absorb_bmi1(
   reg   u64 rate // rate already in bytes -- it is returned bc of spills
 ) -> reg ptr u64[25], reg u64
 {
-  stack u64 s_in s_inlen s_rate;
+  #mmx reg u64 s_in s_inlen s_rate;
   reg u8 trail_byte;
 
   // intermediate blocks
@@ -174,13 +174,13 @@ inline fn __xtr_bytes_bmi1(
 
 inline fn __squeeze_bmi1(
   reg mut ptr u64[25] state,
-  stack   u64 s_out,
-  reg     u64 outlen,
-  reg     u64 rate
+  #mmx reg u64 s_out,
+  reg      u64 outlen,
+  reg      u64 rate
 )
 {
   reg u64 out;
-  stack u64 s_outlen s_rate;
+  #mmx reg u64 s_outlen s_rate;
 
   // intermediate blocks
   while ( outlen > rate )
@@ -213,7 +213,7 @@ inline fn __keccak1600_bmi1(reg u64 out outlen in inlen, reg u8 trail_byte, reg
 {
   stack u64[25] _state;
   reg ptr u64[25] state;
-  stack u64 s_out s_outlen;
+  #mmx reg u64 s_out s_outlen;
   stack u8 s_trail_byte;
 
   s_out = out;
diff --git a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
index 565c69ae..40754c55 100644
--- a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
@@ -129,7 +129,7 @@ inline fn __round_bmi1(reg ptr u64[25] e a, reg u64 rc) -> reg ptr u64[25]
 inline fn __keccakf1600_bmi1(reg ptr u64[25] a) -> reg ptr u64[25]
 {
   reg ptr u64[24] RC;
-  stack ptr u64[24] s_RC;
+  #mmx reg ptr u64[24] s_RC;
   stack u64[25] s_e;
   reg ptr u64[25] e;
   reg u64 c rc;
diff --git a/src/common/keccak/keccak1600/amd64/ref/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/ref/keccak1600.jinc
index cd718735..f903e7cb 100644
--- a/src/common/keccak/keccak1600/amd64/ref/keccak1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref/keccak1600.jinc
@@ -87,7 +87,7 @@ inline fn __absorb_ref(
   reg   u64 rate // rate already in bytes -- it is returned bc of spills
 ) -> stack u64[25], reg u64
 {
-  stack u64 s_in s_inlen s_rate;
+  #mmx reg u64 s_in s_inlen s_rate;
   reg u8 trail_byte;
 
   // intermediate blocks
@@ -171,13 +171,13 @@ inline fn __xtr_bytes_ref(
 
 inline fn __squeeze_ref(
   stack u64[25] state,
-  stack u64 s_out,
+  #mmx reg u64 s_out,
   reg   u64 outlen,
   reg   u64 rate
 )
 {
   reg u64 out;
-  stack u64 s_outlen s_rate;
+  #mmx reg u64 s_outlen s_rate;
 
   // intermediate blocks
   while ( outlen > rate )
@@ -205,7 +205,7 @@ inline fn __squeeze_ref(
 inline fn __keccak1600_ref(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate)
 {
   stack u64[25] state;
-  stack u64 s_out s_outlen;
+  #mmx reg u64 s_out s_outlen;
   stack u8 s_trail_byte;
 
   s_out = out;
diff --git a/src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz b/src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz
index 9703da0d..df52afb9 100644
--- a/src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz
+++ b/src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-224.jinc"
 export fn jade_hash_sha3_224_amd64_bmi1(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_224_bmi1(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-224/amd64/ref/hash.jazz b/src/crypto_hash/sha3-224/amd64/ref/hash.jazz
index 0bda7d05..9739444c 100644
--- a/src/crypto_hash/sha3-224/amd64/ref/hash.jazz
+++ b/src/crypto_hash/sha3-224/amd64/ref/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-224.jinc"
 export fn jade_hash_sha3_224_amd64_ref(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_224_ref(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz b/src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz
index f2e646cb..bba2e585 100644
--- a/src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz
+++ b/src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-256.jinc"
 export fn jade_hash_sha3_256_amd64_bmi1(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_256_bmi1(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-256/amd64/ref/hash.jazz b/src/crypto_hash/sha3-256/amd64/ref/hash.jazz
index 6c381cce..bfa36c72 100644
--- a/src/crypto_hash/sha3-256/amd64/ref/hash.jazz
+++ b/src/crypto_hash/sha3-256/amd64/ref/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-256.jinc"
 export fn jade_hash_sha3_256_amd64_ref(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_256_ref(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz b/src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz
index 6090b84e..48b124fb 100644
--- a/src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz
+++ b/src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-384.jinc"
 export fn jade_hash_sha3_384_amd64_bmi1(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_384_bmi1(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-384/amd64/ref/hash.jazz b/src/crypto_hash/sha3-384/amd64/ref/hash.jazz
index fb952862..65518d29 100644
--- a/src/crypto_hash/sha3-384/amd64/ref/hash.jazz
+++ b/src/crypto_hash/sha3-384/amd64/ref/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-384.jinc"
 export fn jade_hash_sha3_384_amd64_ref(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_384_ref(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz b/src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz
index 79a4f3ce..4ff72114 100644
--- a/src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz
+++ b/src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-512.jinc"
 export fn jade_hash_sha3_512_amd64_bmi1(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_512_bmi1(hash, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_hash/sha3-512/amd64/ref/hash.jazz b/src/crypto_hash/sha3-512/amd64/ref/hash.jazz
index aa265621..c127947c 100644
--- a/src/crypto_hash/sha3-512/amd64/ref/hash.jazz
+++ b/src/crypto_hash/sha3-512/amd64/ref/hash.jazz
@@ -3,6 +3,9 @@ require "sha3-512.jinc"
 export fn jade_hash_sha3_512_amd64_ref(reg u64 hash input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __sha3_512_ref(hash, input, input_length);
   ?{}, r = #set0();
   return r;

From a283669451be00372fffe73eccdb60e24b138216 Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Wed, 26 Jun 2024 08:50:25 +0100
Subject: [PATCH 10/19] sct: crypto_xof

---
 src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc | 2 +-
 src/crypto_xof/shake128/amd64/avx2/shake128.jinc        | 4 ++--
 src/crypto_xof/shake128/amd64/avx2/xof.jazz             | 6 +++++-
 src/crypto_xof/shake128/amd64/bmi1/xof.jazz             | 3 +++
 src/crypto_xof/shake128/amd64/ref/xof.jazz              | 3 +++
 src/crypto_xof/shake128/amd64/ref1/xof.jazz             | 3 +++
 src/crypto_xof/shake256/amd64/avx2/shake256.jinc        | 4 ++--
 src/crypto_xof/shake256/amd64/avx2/xof.jazz             | 7 ++++++-
 src/crypto_xof/shake256/amd64/bmi1/xof.jazz             | 3 +++
 src/crypto_xof/shake256/amd64/ref/xof.jazz              | 3 +++
 src/crypto_xof/shake256/amd64/ref1/xof.jazz             | 3 +++
 src/crypto_xof/shake256/amd64/spec/xof.jazz             | 3 +++
 12 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc
index 7dd3b9e3..783813b2 100644
--- a/src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc
@@ -136,7 +136,7 @@ inline fn __xtr_bytes_spec(
 inline fn __keccak1600_spec(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate)
 {
   stack u64[25] state;
-  stack u64 s_out s_outlen s_in s_inlen s_rate;
+  #mmx reg u64 s_out s_outlen s_in s_inlen s_rate;
   stack u8 s_trail_byte;
 
   s_out = out;
diff --git a/src/crypto_xof/shake128/amd64/avx2/shake128.jinc b/src/crypto_xof/shake128/amd64/avx2/shake128.jinc
index 187aac91..70875d0c 100644
--- a/src/crypto_xof/shake128/amd64/avx2/shake128.jinc
+++ b/src/crypto_xof/shake128/amd64/avx2/shake128.jinc
@@ -1,6 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc"
 
-inline fn __shake128_avx2(reg u64 out outlen in inlen)
+inline fn __shake128_avx2(reg u64 out outlen in inlen, #msf reg u64 ms)
 {
   reg u64 rate;
   reg u8 trail_byte;
@@ -8,7 +8,7 @@ inline fn __shake128_avx2(reg u64 out outlen in inlen)
   trail_byte = 0x1F;
   rate = (1344/8);
 
-  __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate);
+  __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms);
 }
 
 
diff --git a/src/crypto_xof/shake128/amd64/avx2/xof.jazz b/src/crypto_xof/shake128/amd64/avx2/xof.jazz
index 23dd3b45..6b41b262 100644
--- a/src/crypto_xof/shake128/amd64/avx2/xof.jazz
+++ b/src/crypto_xof/shake128/amd64/avx2/xof.jazz
@@ -3,7 +3,11 @@ require "shake128.jinc"
 export fn jade_xof_shake128_amd64_avx2(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
-  __shake128_avx2(output, output_length, input, input_length);
+  #msf reg u64 ms;
+
+  ms = #init_msf();
+
+  __shake128_avx2(output, output_length, input, input_length, ms);
   ?{}, r = #set0();
   return r;
 }
diff --git a/src/crypto_xof/shake128/amd64/bmi1/xof.jazz b/src/crypto_xof/shake128/amd64/bmi1/xof.jazz
index 19921991..4c32db8d 100644
--- a/src/crypto_xof/shake128/amd64/bmi1/xof.jazz
+++ b/src/crypto_xof/shake128/amd64/bmi1/xof.jazz
@@ -3,6 +3,9 @@ require "shake128.jinc"
 export fn jade_xof_shake128_amd64_bmi1(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __shake128_bmi1(output, output_length, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_xof/shake128/amd64/ref/xof.jazz b/src/crypto_xof/shake128/amd64/ref/xof.jazz
index ad386786..3cb07b30 100644
--- a/src/crypto_xof/shake128/amd64/ref/xof.jazz
+++ b/src/crypto_xof/shake128/amd64/ref/xof.jazz
@@ -3,6 +3,9 @@ require "shake128.jinc"
 export fn jade_xof_shake128_amd64_ref(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __shake128_ref(output, output_length, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_xof/shake128/amd64/ref1/xof.jazz b/src/crypto_xof/shake128/amd64/ref1/xof.jazz
index 28e571ea..063f9637 100644
--- a/src/crypto_xof/shake128/amd64/ref1/xof.jazz
+++ b/src/crypto_xof/shake128/amd64/ref1/xof.jazz
@@ -3,6 +3,9 @@ require "shake128.jinc"
 export fn jade_xof_shake128_amd64_ref1(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __shake128_ref1(output, output_length, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_xof/shake256/amd64/avx2/shake256.jinc b/src/crypto_xof/shake256/amd64/avx2/shake256.jinc
index 37c02fef..17a24c27 100644
--- a/src/crypto_xof/shake256/amd64/avx2/shake256.jinc
+++ b/src/crypto_xof/shake256/amd64/avx2/shake256.jinc
@@ -1,6 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc"
 
-inline fn __shake256_avx2(reg u64 out outlen in inlen)
+inline fn __shake256_avx2(reg u64 out outlen in inlen, #msf reg u64 ms)
 {
   reg u64 rate;
   reg u8 trail_byte;
@@ -8,7 +8,7 @@ inline fn __shake256_avx2(reg u64 out outlen in inlen)
   trail_byte = 0x1F;
   rate = (1088/8);
 
-  __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate);
+  __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms);
 }
 
 
diff --git a/src/crypto_xof/shake256/amd64/avx2/xof.jazz b/src/crypto_xof/shake256/amd64/avx2/xof.jazz
index 169f7701..80d3ae69 100644
--- a/src/crypto_xof/shake256/amd64/avx2/xof.jazz
+++ b/src/crypto_xof/shake256/amd64/avx2/xof.jazz
@@ -3,7 +3,12 @@ require "shake256.jinc"
 export fn jade_xof_shake256_amd64_avx2(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
-  __shake256_avx2(output, output_length, input, input_length);
+  #msf reg u64 ms;
+
+  ms = #init_msf();
+
+  __shake256_avx2(output, output_length, input, input_length, ms);
+
   ?{}, r = #set0();
   return r;
 }
diff --git a/src/crypto_xof/shake256/amd64/bmi1/xof.jazz b/src/crypto_xof/shake256/amd64/bmi1/xof.jazz
index f0988dd9..0a01874f 100644
--- a/src/crypto_xof/shake256/amd64/bmi1/xof.jazz
+++ b/src/crypto_xof/shake256/amd64/bmi1/xof.jazz
@@ -3,6 +3,9 @@ require "shake256.jinc"
 export fn jade_xof_shake256_amd64_bmi1(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __shake256_bmi1(output, output_length, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_xof/shake256/amd64/ref/xof.jazz b/src/crypto_xof/shake256/amd64/ref/xof.jazz
index 8eb4e643..c876881e 100644
--- a/src/crypto_xof/shake256/amd64/ref/xof.jazz
+++ b/src/crypto_xof/shake256/amd64/ref/xof.jazz
@@ -3,6 +3,9 @@ require "shake256.jinc"
 export fn jade_xof_shake256_amd64_ref(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __shake256_ref(output, output_length, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_xof/shake256/amd64/ref1/xof.jazz b/src/crypto_xof/shake256/amd64/ref1/xof.jazz
index 2051d26f..23d811bb 100644
--- a/src/crypto_xof/shake256/amd64/ref1/xof.jazz
+++ b/src/crypto_xof/shake256/amd64/ref1/xof.jazz
@@ -3,6 +3,9 @@ require "shake256.jinc"
 export fn jade_xof_shake256_amd64_ref1(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __shake256_ref1(output, output_length, input, input_length);
   ?{}, r = #set0();
   return r;
diff --git a/src/crypto_xof/shake256/amd64/spec/xof.jazz b/src/crypto_xof/shake256/amd64/spec/xof.jazz
index f7045070..04b30887 100644
--- a/src/crypto_xof/shake256/amd64/spec/xof.jazz
+++ b/src/crypto_xof/shake256/amd64/spec/xof.jazz
@@ -3,6 +3,9 @@ require "shake256.jinc"
 export fn jade_xof_shake256_amd64_spec(reg u64 output output_length input input_length) -> reg u64
 {
   reg u64 r;
+
+  _ = #init_msf();
+
   __shake256_spec(output, output_length, input, input_length);
   ?{}, r = #set0();
   return r;

From 11385db258b2b42d259babc3eb49897b6ca49d73 Mon Sep 17 00:00:00 2001
From: Tiago Oliveira <tfaoliveira@gmail.com>
Date: Thu, 27 Jun 2024 12:42:32 +0100
Subject: [PATCH 11/19] sct: fix compilation of kyber* (no sct yet); there will
 be a separate PR;

---
 src/common/keccak/common/fips202_DIRTY.jinc   |   6 +-
 .../amd64/avx2/keccak1600_nomsf.jinc          | 272 ++++++++++++++++++
 .../amd64/avx2/keccakf1600_nomsf.jinc         | 202 +++++++++++++
 3 files changed, 476 insertions(+), 4 deletions(-)
 create mode 100644 src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc
 create mode 100644 src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc

diff --git a/src/common/keccak/common/fips202_DIRTY.jinc b/src/common/keccak/common/fips202_DIRTY.jinc
index 92698c60..82f6c335 100644
--- a/src/common/keccak/common/fips202_DIRTY.jinc
+++ b/src/common/keccak/common/fips202_DIRTY.jinc
@@ -1,7 +1,5 @@
-param int KECCAK_ROUNDS=24;
-
-from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc"
-from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc"
+from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc"
+from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc"
 require "fips202_params.jinc"
 
 #[returnaddress="stack"]
diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc
new file mode 100644
index 00000000..0f6ace84
--- /dev/null
+++ b/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc
@@ -0,0 +1,272 @@
+param int KECCAK_ROUNDS=24;
+
+require "keccakf1600_nomsf.jinc"
+
+inline fn __keccak_init_avx2() -> reg u256[7]  // returns a 7 x u256 state with every register zeroed
+{
+  inline int i;
+  reg u256[7] state;
+
+  for i=0 to 7
+  { state[i] = #set0_256(); }  // clear each of the 7 registers
+
+  return state;
+}
+
+// Returns a 28-word (7 x 256-bit) stack buffer with every bit set to zero.
+inline fn __init_s_state_avx2() -> stack u64[28]
+{
+  inline int i;
+  stack u64[28] s_state;
+  reg u256 zero;
+
+  zero = #set0_256();
+  for i=0 to 7
+  { s_state[u256 i] = zero; }  // 256-bit store at u256 index i
+
+  return s_state;
+}
+
+// Xors one rate-sized input block into state; returns state, the scratch buffer, in + rate and inlen - rate.
+inline fn __add_full_block_avx2(
+  reg u256[7] state,
+  stack u64[28] s_state,
+  reg ptr u64[25] a_jagged_p,
+  reg u64 in inlen,
+  reg u64 rate
+) -> reg u256[7], stack u64[28], reg u64, reg u64
+{
+
+  inline int i;
+  reg u64 j l t rate8;
+
+  rate8 = rate;
+  rate8 >>= 3;  // rate expressed in 64-bit words
+  j = 0;
+  while ( j < rate8 )
+  {
+    t = [in + 8*j];  // j-th 64-bit input word
+    l = a_jagged_p[(int) j];  // destination word index looked up in the table
+    s_state[(int) l] = t;
+    j += 1;
+  }
+
+  //TODO: check & change to #VPBROADCAST_4u64
+  t = s_state[0];  // replicate word 0 into words 1..3
+  s_state[1] = t;
+  s_state[2] = t;
+  s_state[3] = t;
+
+  for i = 0 to 7
+  { state[i] ^= s_state[u256 i]; }  // xor the scratch buffer into the state, 256 bits at a time
+
+  in += rate;
+  inlen -= rate;
+
+  return state, s_state, in, inlen;
+}
+
+
+// TODO: refactor when this feature is available: https://github.com/haslab/libjbn/wiki/Feature-request-%231#procedural-parameters
+inline fn __add_final_block_avx2(  // xors the last inlen input bytes plus trail_byte and the 0x80 end marker into state
+  reg  u256[7] state,
+  stack u64[28] s_state,
+  reg ptr u64[25] a_jagged_p,
+  reg   u64 in inlen,
+  reg   u8  trail_byte,
+  reg   u64 rate
+) -> reg u256[7]
+{
+  inline int i;
+  reg u64 j l t inlen8;
+  reg u8 c;
+
+  s_state = __init_s_state_avx2();  // the incoming s_state contents are discarded: start from an all-zero buffer
+
+  inlen8 = inlen;
+  inlen8 >>= 3;  // number of whole 64-bit words in the remaining input
+  j = 0;
+  while ( j < inlen8 )
+  {
+    t = [in + 8*j];
+    l = a_jagged_p[(int) j];  // destination word index looked up in the table
+    s_state[(int) l] = t;
+    j += 1;
+  }
+  l = a_jagged_p[(int) j];  // word that receives the leftover bytes
+  l <<= 3;  // word index -> byte offset inside s_state
+  j <<= 3;  // word count -> byte offset inside the input
+
+  while ( j < inlen )
+  {
+    c = (u8)[in + j];
+    s_state[u8 (int) l] = c;
+    j += 1;
+    l += 1;
+  }
+
+  s_state[u8 (int) l] = trail_byte;  // byte written right after the last input byte
+
+  // j  = (rate-1) >> 3;
+  j = rate; j -= 1; j >>= 3;
+  l  = a_jagged_p[(int) j];
+  l <<= 3;
+  // l += ((rate-1) & 0x7)
+  j = rate; j -= 1; j &= 0x7;
+  l += j;
+
+  s_state[u8 (int) l] ^= 0x80;  // flip the top bit of the byte that corresponds to input position rate-1
+
+  t = s_state[0];  // replicate word 0 into words 1..3
+  s_state[1] = t;
+  s_state[2] = t;
+  s_state[3] = t;
+
+  for i = 0 to 7
+  { state[i] ^= s_state[u256 i]; }  // xor the scratch buffer into the state, 256 bits at a time
+
+  return state;
+}
+
+
+// Precondition: len <= rate_in_bytes. Writes len/8 whole 64-bit state words to out and returns out + len.
+inline fn __xtr_full_block_avx2(
+  reg u256[7] state,
+  reg ptr u64[25] a_jagged_p,
+  reg u64 out,
+  reg u64 len
+) -> reg u64
+{
+  inline int i;
+  stack u64[28] s_state;
+  reg u64 j l t len8;
+
+  for i = 0 to 7
+  { s_state[u256 i] = state[i]; }  // copy the state registers to the stack for word-wise access
+
+  len8 = len;
+  len8 >>= 3;  // number of whole 64-bit words to write
+  j = 0;
+  while ( j < len8 )
+  {
+    l = a_jagged_p[(int) j];  // position of output word j inside s_state
+    t = s_state[(int) l];
+    [out + 8*j] = t;
+    j += 1;
+  }
+
+  out += len;  // NOTE(review): a len % 8 tail is skipped, not written -- presumably len is always a multiple of 8 here; confirm
+
+  return out;
+}
+
+
+// Precondition: len <= rate_in_bytes. Writes exactly len state bytes (whole words, then a byte tail) to out; returns out + len.
+inline fn __xtr_bytes_avx2(
+  reg u256[7] state,
+  reg ptr u64[25] a_jagged_p,
+  reg u64 out,
+  reg u64 len
+) -> reg u64
+{
+  inline int i;
+  stack u64[28] s_state;
+  reg u64 j l t len8;
+  reg u8 c;
+
+  for i = 0 to 7
+  { s_state[u256 i] = state[i]; }  // copy the state registers to the stack for word/byte access
+
+  len8 = len;
+  len8 >>= 3;  // number of whole 64-bit words
+  j = 0;
+  while ( j < len8 )
+  { l = a_jagged_p[(int) j];
+    t = s_state[(int) l];
+    [out + 8*j] = t;
+    j += 1;
+  }
+  l = a_jagged_p[(int)j];  // word holding the leftover bytes
+  j <<= 3;  // word count -> byte offset inside the output
+  l <<= 3;  // word index -> byte offset inside s_state
+
+  while ( j < len )
+  {
+    c = s_state[u8 (int) l];
+    (u8)[out + j] = c;
+    j += 1;
+    l += 1;
+  }
+
+  out += len;
+
+  return out;
+}
+
+// Absorbs inlen bytes at `in`: one permutation per full rate-sized block, then the final block is xored in (no permutation after it).
+inline fn __absorb_avx2(
+  reg u256[7] state,
+  reg u64 in inlen,
+  reg u8  trail_byte,
+  reg u64 rate
+) -> reg u256[7]
+{
+  stack u64[28] s_state;
+  reg ptr u64[25] a_jagged_p;
+
+  a_jagged_p = KECCAK_A_JAGGED;
+  s_state = __init_s_state_avx2();
+
+  // intermediate blocks
+  while ( inlen >= rate )
+  {
+    state, s_state, in, inlen = __add_full_block_avx2(state, s_state, a_jagged_p, in, inlen, rate);
+    state = __keccakf1600_avx2(state);
+  }
+
+  // final block
+  state = __add_final_block_avx2(state, s_state, a_jagged_p, in, inlen, trail_byte, rate);  // here inlen < rate
+
+  return state;
+}
+
+// Writes outlen bytes to out, running the permutation before each extracted block.
+inline fn __squeeze_avx2(reg u256[7] state, reg u64 out outlen rate)
+{
+  reg ptr u64[25] a_jagged_p;
+
+  a_jagged_p = KECCAK_A_JAGGED;
+
+  // intermediate blocks
+  while ( outlen > rate )
+  {
+    state = __keccakf1600_avx2(state);
+    out = __xtr_full_block_avx2(state, a_jagged_p, out, rate);
+    outlen -= rate;
+  }
+
+  state = __keccakf1600_avx2(state);  // always runs once more, even when outlen is 0
+  out = __xtr_bytes_avx2(state, a_jagged_p, out, outlen);  // last block: 0 <= outlen <= rate bytes
+}
+
+// One-shot sponge: zero the state, absorb inlen bytes from `in`, then squeeze outlen bytes to `out`.
+inline fn __keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate)
+{
+  reg u256[7] state;
+
+  state = __keccak_init_avx2();
+
+  // absorb
+  state = __absorb_avx2(state, in, inlen, trail_byte, rate);
+
+  // squeeze
+  __squeeze_avx2(state, out, outlen, rate);
+}
+
+// Non-inline entry point that forwards all arguments unchanged to __keccak1600_avx2.
+fn _keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate)
+{
+  __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate);
+}
+
+
diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc
new file mode 100644
index 00000000..6ca9dda6
--- /dev/null
+++ b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc
@@ -0,0 +1,202 @@
+// 24 entries, one per round: each holds a single 64-bit constant repeated in all four 64-bit lanes; xored into state[0] in the Iota step.
+u256[24] KECCAK_IOTAS =
+{  (4u64)[0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001]
+  ,(4u64)[0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082]
+  ,(4u64)[0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a]
+  ,(4u64)[0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000]
+  ,(4u64)[0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b]
+  ,(4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001]
+  ,(4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081]
+  ,(4u64)[0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009]
+  ,(4u64)[0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a]
+  ,(4u64)[0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088]
+  ,(4u64)[0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009]
+  ,(4u64)[0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a]
+  ,(4u64)[0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b]
+  ,(4u64)[0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b]
+  ,(4u64)[0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089]
+  ,(4u64)[0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003]
+  ,(4u64)[0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002]
+  ,(4u64)[0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080]
+  ,(4u64)[0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a]
+  ,(4u64)[0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a]
+  ,(4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081]
+  ,(4u64)[0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080]
+  ,(4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001]
+  ,(4u64)[0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008]
+};
+
+// Per-lane left-shift counts used by the Rho step; entry k-1 is applied to state[k] for k = 1..6.
+u256[6] KECCAK_RHOTATES_LEFT = 
+{
+  (4u64)[41, 36, 18,  3],
+  (4u64)[27, 28, 62,  1],
+  (4u64)[39, 56,  6, 45],
+  (4u64)[ 8, 55, 61, 10],
+  (4u64)[20, 25, 15,  2],
+  (4u64)[14, 21, 43, 44]
+};
+
+// Complementary right-shift counts: each entry is 64 minus the matching KECCAK_RHOTATES_LEFT entry.
+u256[6] KECCAK_RHOTATES_RIGHT =
+{
+  (4u64)[64-41, 64-36, 64-18, 64- 3],
+  (4u64)[64-27, 64-28, 64-62, 64- 1],
+  (4u64)[64-39, 64-56, 64- 6, 64-45],
+  (4u64)[64- 8, 64-55, 64-61, 64-10],
+  (4u64)[64-20, 64-25, 64-15, 64- 2],
+  (4u64)[64-14, 64-21, 64-43, 64-44]
+};
+
+// Maps each of 25 word indices to a slot in a 28-word buffer; slots 1..3 are never targeted (users fill them with copies of slot 0).
+u64[25] KECCAK_A_JAGGED = 
+{
+   0,  4,  5,  6,  7,
+  10, 24, 13, 18, 23,
+   8, 16, 25, 22, 15,
+  11, 12, 21, 26, 19,
+   9, 20, 17, 14, 27
+};
+
+// Applies KECCAK_ROUNDS rounds (Theta, Rho + Pi, Chi, Iota) to the 7 x u256 state and returns the updated state.
+inline fn __keccakf1600_avx2(reg u256[7] state) -> reg u256[7]
+{
+  reg u256[9] t;
+  reg u256 c00 c14 d00 d14;
+
+  reg bool zf;
+  reg u64 r iotas_o;
+
+  reg ptr u256[24] iotas_p;
+  reg ptr u256[6] rhotates_left_p;
+  reg ptr u256[6] rhotates_right_p;
+
+  iotas_p = KECCAK_IOTAS;
+  iotas_o = 0;  // byte offset of the current round constant inside iotas_p
+  rhotates_left_p = KECCAK_RHOTATES_LEFT;
+  rhotates_right_p = KECCAK_RHOTATES_RIGHT;
+
+  r = KECCAK_ROUNDS;  // remaining rounds; counts down to zero
+  while
+  {
+	  //######################################## Theta
+	  c00 = #VPSHUFD_256(state[2], (4u2)[1,0,3,2]);
+	  c14 = state[5] ^ state[3];
+	  t[2] = state[4] ^ state[6];
+	  c14 = c14 ^ state[1];
+	  c14 = c14 ^ t[2];
+	  t[4] = #VPERMQ(c14, (4u2)[2,1,0,3]);
+	  c00 = c00 ^ state[2];
+	  t[0] = #VPERMQ(c00, (4u2)[1,0,3,2]);
+	  t[1] = c14 >>4u64 63;
+	  t[2] = c14 +4u64 c14;
+	  t[1] = t[1] | t[2];
+	  d14 = #VPERMQ(t[1], (4u2)[0,3,2,1]);
+	  d00 = t[1] ^ t[4];
+	  d00 = #VPERMQ(d00, (4u2)[0,0,0,0]);
+	  c00 = c00 ^ state[0];
+	  c00 = c00 ^ t[0];
+	  t[0] = c00 >>4u64 63;
+	  t[1] = c00 +4u64 c00;
+	  t[1] = t[1] | t[0];
+	  state[2] = state[2] ^ d00;
+	  state[0] = state[0] ^ d00;
+	  d14 = #VPBLEND_8u32(d14, t[1], (8u1)[1,1,0,0,0,0,0,0]);
+	  t[4] = #VPBLEND_8u32(t[4], c00, (8u1)[0,0,0,0,0,0,1,1]);
+	  d14 = d14 ^ t[4];
+
+	  //######################################## Rho + Pi + pre-Chi shuffle
+    t[3] = #VPSLLV_4u64(state[2], rhotates_left_p[0] );
+	  state[2] = #VPSRLV_4u64(state[2], rhotates_right_p[0] );
+	  state[2] = state[2] | t[3];
+	  state[3] = state[3] ^ d14;
+	  t[4] = #VPSLLV_4u64(state[3], rhotates_left_p[2] );
+	  state[3] = #VPSRLV_4u64(state[3], rhotates_right_p[2] );
+	  state[3] = state[3] | t[4];
+	  state[4] = state[4] ^ d14;
+	  t[5] = #VPSLLV_4u64(state[4], rhotates_left_p[3] );
+	  state[4] = #VPSRLV_4u64(state[4], rhotates_right_p[3] );
+	  state[4] = state[4] | t[5];
+	  state[5] = state[5] ^ d14;
+	  t[6] = #VPSLLV_4u64(state[5], rhotates_left_p[4] );
+	  state[5] = #VPSRLV_4u64(state[5], rhotates_right_p[4] );
+	  state[5] = state[5] | t[6];
+	  state[6] = state[6] ^ d14;
+	  t[3] = #VPERMQ(state[2], (4u2)[2,0,3,1]);
+	  t[4] = #VPERMQ(state[3], (4u2)[2,0,3,1]);
+	  t[7] = #VPSLLV_4u64(state[6], rhotates_left_p[5] );
+	  t[1] = #VPSRLV_4u64(state[6], rhotates_right_p[5] );
+	  t[1] = t[1] | t[7];
+	  state[1] = state[1] ^ d14;
+	  t[5] = #VPERMQ(state[4], (4u2)[0,1,2,3]);
+	  t[6] = #VPERMQ(state[5], (4u2)[1,3,0,2]);
+	  t[8] = #VPSLLV_4u64(state[1], rhotates_left_p[1] );
+	  t[2] = #VPSRLV_4u64(state[1], rhotates_right_p[1] );
+	  t[2] = t[2] | t[8];
+
+	  //######################################## Chi
+	  t[7] = #VPSRLDQ_256(t[1], 8);
+	  t[0] = !t[1] & t[7];
+	  state[3] = #VPBLEND_8u32(t[2], t[6], (8u1)[0,0,0,0,1,1,0,0]);
+	  t[8] = #VPBLEND_8u32(t[4], t[2], (8u1)[0,0,0,0,1,1,0,0]);
+	  state[5] = #VPBLEND_8u32(t[3], t[4], (8u1)[0,0,0,0,1,1,0,0]);
+	  t[7] = #VPBLEND_8u32(t[2], t[3], (8u1)[0,0,0,0,1,1,0,0]);
+	  state[3] = #VPBLEND_8u32(state[3], t[4], (8u1)[0,0,1,1,0,0,0,0]);
+	  t[8] = #VPBLEND_8u32(t[8], t[5], (8u1)[0,0,1,1,0,0,0,0]);
+	  state[5] = #VPBLEND_8u32(state[5], t[2], (8u1)[0,0,1,1,0,0,0,0]);
+	  t[7] = #VPBLEND_8u32(t[7], t[6], (8u1)[0,0,1,1,0,0,0,0]);
+	  state[3] = #VPBLEND_8u32(state[3], t[5], (8u1)[1,1,0,0,0,0,0,0]);
+	  t[8] = #VPBLEND_8u32(t[8], t[6], (8u1)[1,1,0,0,0,0,0,0]);
+	  state[5] = #VPBLEND_8u32(state[5], t[6], (8u1)[1,1,0,0,0,0,0,0]);
+	  t[7] = #VPBLEND_8u32(t[7], t[4], (8u1)[1,1,0,0,0,0,0,0]);
+	  state[3] = !state[3] & t[8];
+	  state[5] = !state[5] & t[7];
+	  state[6] = #VPBLEND_8u32(t[5], t[2], (8u1)[0,0,0,0,1,1,0,0]);
+	  t[8] = #VPBLEND_8u32(t[3], t[5], (8u1)[0,0,0,0,1,1,0,0]);
+	  state[3] = state[3] ^ t[3];
+	  state[6] = #VPBLEND_8u32(state[6], t[3], (8u1)[0,0,1,1,0,0,0,0]);
+	  t[8] = #VPBLEND_8u32(t[8], t[4], (8u1)[0,0,1,1,0,0,0,0]);
+	  state[5] = state[5] ^ t[5];
+	  state[6] = #VPBLEND_8u32(state[6], t[4], (8u1)[1,1,0,0,0,0,0,0]);
+	  t[8] = #VPBLEND_8u32(t[8], t[2], (8u1)[1,1,0,0,0,0,0,0]);
+	  state[6] = !state[6] & t[8];
+	  state[6] = state[6] ^ t[6];
+	  state[4] = #VPERMQ(t[1], (4u2)[0,1,3,2]);
+	  t[8] = #VPBLEND_8u32(state[4], state[0], (8u1)[0,0,1,1,0,0,0,0]);
+	  state[1] = #VPERMQ(t[1], (4u2)[0,3,2,1]);
+	  state[1] = #VPBLEND_8u32(state[1], state[0], (8u1)[1,1,0,0,0,0,0,0]);
+	  state[1] = !state[1] & t[8];
+	  state[2] = #VPBLEND_8u32(t[4], t[5], (8u1)[0,0,0,0,1,1,0,0]);
+	  t[7] = #VPBLEND_8u32(t[6], t[4], (8u1)[0,0,0,0,1,1,0,0]);
+	  state[2] = #VPBLEND_8u32(state[2], t[6], (8u1)[0,0,1,1,0,0,0,0]);
+	  t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[0,0,1,1,0,0,0,0]);
+	  state[2] = #VPBLEND_8u32(state[2], t[3], (8u1)[1,1,0,0,0,0,0,0]);
+	  t[7] = #VPBLEND_8u32(t[7], t[5], (8u1)[1,1,0,0,0,0,0,0]);
+	  state[2] = !state[2] & t[7];
+	  state[2] = state[2] ^ t[2];
+	  t[0] = #VPERMQ(t[0], (4u2)[0,0,0,0]);
+	  state[3] = #VPERMQ(state[3], (4u2)[0,1,2,3]);
+	  state[5] = #VPERMQ(state[5], (4u2)[2,0,3,1]);
+	  state[6] = #VPERMQ(state[6], (4u2)[1,3,0,2]);
+	  state[4] = #VPBLEND_8u32(t[6], t[3], (8u1)[0,0,0,0,1,1,0,0]);
+	  t[7] = #VPBLEND_8u32(t[5], t[6], (8u1)[0,0,0,0,1,1,0,0]);
+	  state[4] = #VPBLEND_8u32(state[4], t[5], (8u1)[0,0,1,1,0,0,0,0]);
+	  t[7] = #VPBLEND_8u32(t[7], t[2], (8u1)[0,0,1,1,0,0,0,0]);
+	  state[4] = #VPBLEND_8u32(state[4], t[2], (8u1)[1,1,0,0,0,0,0,0]);
+	  t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[1,1,0,0,0,0,0,0]);
+	  state[4] = !state[4] & t[7];
+	  state[0] = state[0] ^ t[0];
+	  state[1] = state[1] ^ t[1];
+	  state[4] = state[4] ^ t[4];
+
+	  //######################################## Iota
+	  state[0] = state[0] ^ iotas_p.[(int) iotas_o];
+    iotas_o += 32;  // step to the next u256 entry (32 bytes)
+
+    _,_,_,zf,r = #DEC_64(r);  // one round done; zf is taken from the decrement
+  }(!zf)  // loop again while zf is clear
+
+  return state;
+}
+
+

From 1a0cf66b4ff6adf45bfae21f0aad6c0c11502ed8 Mon Sep 17 00:00:00 2001
From: Vincent Laporte <Vincent.Laporte@inria.fr>
Date: Thu, 27 Jun 2024 17:45:32 +0200
Subject: [PATCH 12/19] Remove declarations of unused variables

---
 src/common/keccak/common/fips202_DIRTY.jinc     |  1 -
 .../kyber/common/amd64/avx2/poly.jinc           | 17 +++--------------
 .../kyber/common/amd64/avx2/polyvec.jinc        |  2 --
 src/crypto_kem/kyber/common/amd64/kem.jinc      |  1 -
 src/crypto_kem/kyber/common/amd64/ref/poly.jinc |  3 ---
 .../kyber/common/amd64/ref/polyvec.jinc         |  2 --
 .../kyber/common/amd64/ref/verify.jinc          |  2 +-
 .../kyber/kyber512/amd64/ref/indcpa.jinc        |  2 +-
 .../kyber/kyber768/amd64/avx2/gen_matrix.jinc   |  1 -
 .../kyber/kyber768/amd64/ref/indcpa.jinc        | 10 ++++------
 .../kyber/kyber768/amd64/ref/poly.jinc          |  4 ----
 .../kyber/kyber768/amd64/ref/polyvec.jinc       |  2 --
 .../kyber/kyber768/amd64/ref/verify.jinc        |  2 +-
 .../curve25519/amd64/ref5/scalarmult.jazz       |  2 --
 .../dilithium/common/amd64/avx2/expandA.jinc    |  1 -
 .../common/amd64/avx2/expandA_end.jinc          |  4 ----
 .../dilithium/common/amd64/avx2/expandMask.jinc |  4 ----
 .../common/amd64/avx2/expandMask_end.jinc       |  2 --
 .../dilithium/common/amd64/avx2/ntt.jinc        |  7 ++-----
 .../dilithium/common/amd64/expandS.jinc         |  7 ++-----
 .../dilithium/common/amd64/fips202.jinc         |  1 -
 .../dilithium/common/amd64/keygen_end.jinc      |  5 -----
 .../dilithium/common/amd64/packing.jinc         |  5 +----
 .../dilithium/common/amd64/poly.jinc            |  5 +----
 .../dilithium/common/amd64/verify_end.jinc      | 13 ++++---------
 .../dilithium/dilithium2/amd64/avx2/common.jinc |  5 +----
 .../dilithium/dilithium3/amd64/avx2/common.jinc |  3 ---
 .../falcon/falcon512/amd64/avx2/test.jazz       |  1 -
 28 files changed, 21 insertions(+), 93 deletions(-)

diff --git a/src/common/keccak/common/fips202_DIRTY.jinc b/src/common/keccak/common/fips202_DIRTY.jinc
index 82f6c335..cc4c86af 100644
--- a/src/common/keccak/common/fips202_DIRTY.jinc
+++ b/src/common/keccak/common/fips202_DIRTY.jinc
@@ -336,7 +336,6 @@ fn _sha3_512_32(reg ptr u8[64] out, reg const ptr u8[32] in) -> stack u8[64]
 inline
 fn __shake128_absorb34(reg u256[7] state, reg const ptr u8[34] in) -> reg u256[7]
 {
-  reg u128 t128;
   stack u64[28] s_state;
   stack u64[25] a_jagged_p;
   reg u64 l t;
diff --git a/src/crypto_kem/kyber/common/amd64/avx2/poly.jinc b/src/crypto_kem/kyber/common/amd64/avx2/poly.jinc
index 1241817f..d739c8ed 100644
--- a/src/crypto_kem/kyber/common/amd64/avx2/poly.jinc
+++ b/src/crypto_kem/kyber/common/amd64/avx2/poly.jinc
@@ -208,11 +208,7 @@ fn _poly_compress(reg u64 rp, reg ptr u16[KYBER_N] a) -> reg ptr u16[KYBER_N]
 {
   inline int i;
   reg u256 f0 f1 f2 f3 v shift1 mask shift2 permidx;
-  reg u128 t0 t1 t3;
   reg ptr u16[16] x16p;
-  reg u64 t64;
-  reg u32 t32;
-  reg u16 t16;
 
   a = _poly_csubq(a);
 
@@ -258,11 +254,7 @@ fn _poly_compress_1(reg ptr u8[KYBER_POLYCOMPRESSEDBYTES] rp, reg ptr u16[KYBER_
 {
   inline int i;
   reg u256 f0 f1 f2 f3 v shift1 mask shift2 permidx;
-  reg u128 t0 t1 t3;
   reg ptr u16[16] x16p;
-  reg u64 t64;
-  reg u32 t32;
-  reg u16 t16;
 
   a = _poly_csubq(a);
 
@@ -451,7 +443,7 @@ u8[16] pfm_idx_s = {0, 1, 4, 5, 8, 9, 12, 13,
 fn _poly_frommsg(reg ptr u16[KYBER_N] rp, reg u64 ap) -> stack u16[KYBER_N]
 {
   inline int i;
-  reg u256 f g0 g1 g2 g3 g4 h0 h1 h2 h3;
+  reg u256 f g0 g1 g2 g3 h0 h1 h2 h3;
   reg u256 shift idx hqs;
   reg ptr u16[16] x16p;
 
@@ -499,7 +491,7 @@ fn _poly_frommsg(reg ptr u16[KYBER_N] rp, reg u64 ap) -> stack u16[KYBER_N]
 fn _poly_frommsg_1(reg ptr u16[KYBER_N] rp, reg ptr u8[KYBER_INDCPA_MSGBYTES] ap) -> stack u16[KYBER_N]
 {
   inline int i;
-  reg u256 f g0 g1 g2 g3 g4 h0 h1 h2 h3;
+  reg u256 f g0 g1 g2 g3 h0 h1 h2 h3;
   reg u256 shift idx hqs;
   reg ptr u16[16] x16p;
 
@@ -879,7 +871,7 @@ inline
 fn __invntt___butterfly64x(reg u256 rl0 rl1 rl2 rl3 rh0 rh1 rh2 rh3 zl0 zl1 zh0 zh1 qx16) 
     -> reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256
 {
-  reg u256 t0 t1 t2 t3 t4 t5 t6 t7;
+  reg u256 t0 t1 t2 t3;
 
   t0  = #VPSUB_16u16(rl0, rh0);
   t1  = #VPSUB_16u16(rl1, rh1);
@@ -921,7 +913,6 @@ fn _poly_invntt(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N]
 {
   reg u256 zeta0 zeta1 zeta2 zeta3 r0 r1 r2 r3 r4 r5 r6 r7 qx16 vx16 flox16 fhix16;
   reg ptr u16[400] zetasp;
-  reg ptr u16[16] qx16p;
   inline int i;
 
   zetasp = jzetas_inv_exp;
@@ -1119,8 +1110,6 @@ fn __butterfly64x(reg u256 rl0 rl1 rl2 rl3 rh0 rh1 rh2 rh3 zl0 zl1 zh0 zh1 qx16)
 fn _poly_ntt(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N]
 {
   reg u256 zeta0 zeta1 zeta2 zeta3 r0 r1 r2 r3 r4 r5 r6 r7 qx16 vx16;
-  reg u32 t;
-  reg u16 w;
   reg ptr u16[400] zetasp;
   inline int i;
 
diff --git a/src/crypto_kem/kyber/common/amd64/avx2/polyvec.jinc b/src/crypto_kem/kyber/common/amd64/avx2/polyvec.jinc
index ee2d6ab8..e4f288fc 100644
--- a/src/crypto_kem/kyber/common/amd64/avx2/polyvec.jinc
+++ b/src/crypto_kem/kyber/common/amd64/avx2/polyvec.jinc
@@ -80,7 +80,6 @@ fn __polyvec_compress(reg u64 rp, stack u16[KYBER_VECN] a)
   reg u256 f0 f1 f2 v v8 off shift1 mask shift2 sllvdidx shufbidx;
   reg u128 t0 t1;
   reg ptr u16[16] x16p;
-  reg ptr u8[32] x8p;
 
   a = __polyvec_csubq(a);
 
@@ -127,7 +126,6 @@ fn __polyvec_compress_1(reg ptr u8[KYBER_POLYVECCOMPRESSEDBYTES] rp, stack u16[K
   reg u256 f0 f1 f2 v v8 off shift1 mask shift2 sllvdidx shufbidx;
   reg u128 t0 t1;
   reg ptr u16[16] x16p;
-  reg ptr u8[32] x8p;
 
   a = __polyvec_csubq(a);
 
diff --git a/src/crypto_kem/kyber/common/amd64/kem.jinc b/src/crypto_kem/kyber/common/amd64/kem.jinc
index 62f1e4b8..ea240165 100644
--- a/src/crypto_kem/kyber/common/amd64/kem.jinc
+++ b/src/crypto_kem/kyber/common/amd64/kem.jinc
@@ -50,7 +50,6 @@ fn __crypto_kem_enc_derand_jazz(reg u64 ctp, reg u64 shkp, reg u64 pkp, reg ptr
   stack u8[KYBER_SYMBYTES * 2] buf kr;
   stack u64 s_pkp s_ctp s_shkp;
   reg u64 t64;
-  inline int i;
 
   s_pkp = pkp;
   s_ctp = ctp;
diff --git a/src/crypto_kem/kyber/common/amd64/ref/poly.jinc b/src/crypto_kem/kyber/common/amd64/ref/poly.jinc
index 3978eaec..5c40ec54 100644
--- a/src/crypto_kem/kyber/common/amd64/ref/poly.jinc
+++ b/src/crypto_kem/kyber/common/amd64/ref/poly.jinc
@@ -45,7 +45,6 @@ fn _poly_csubq(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N]
 
 fn _poly_basemul(reg ptr u16[KYBER_N] rp, reg const ptr u16[KYBER_N] ap bp) -> reg ptr u16[KYBER_N]
 {
-  reg u64 offset;
   reg u16 zeta;
   reg u16 r0;
   reg u16 r1;
@@ -296,7 +295,6 @@ fn _poly_frommsg(reg ptr u16[KYBER_N] rp, reg u64 ap) -> stack u16[KYBER_N]
   reg u8 c;
   reg u16 t;
   inline int i;
-  inline int j;
 
   for i = 0 to KYBER_INDCPA_MSGBYTES
   {
@@ -359,7 +357,6 @@ fn _i_poly_frommsg(reg ptr u16[KYBER_N] rp, reg ptr u8[KYBER_INDCPA_MSGBYTES] ap
   reg u8 c;
   reg u16 t;
   inline int i;
-  inline int j;
 
   for i = 0 to KYBER_INDCPA_MSGBYTES
   {
diff --git a/src/crypto_kem/kyber/common/amd64/ref/polyvec.jinc b/src/crypto_kem/kyber/common/amd64/ref/polyvec.jinc
index e1aee308..ea7c31d8 100644
--- a/src/crypto_kem/kyber/common/amd64/ref/polyvec.jinc
+++ b/src/crypto_kem/kyber/common/amd64/ref/polyvec.jinc
@@ -31,7 +31,6 @@ fn __polyvec_compress(reg u64 rp, stack u16[KYBER_VECN] a)
 {
   stack u16[KYBER_VECN] aa;
   reg u16 c, b;
-  reg u16 d;
   reg u64[4] t;
   reg u64 i j;
   inline int k;
@@ -95,7 +94,6 @@ fn __i_polyvec_compress(reg ptr u8[KYBER_POLYVECCOMPRESSEDBYTES] rp, stack u16[K
 {
   stack u16[KYBER_VECN] aa;
   reg u16 c, b;
-  reg u16 d;
   reg u64[4] t;
   reg u64 i j;
   inline int k;
diff --git a/src/crypto_kem/kyber/common/amd64/ref/verify.jinc b/src/crypto_kem/kyber/common/amd64/ref/verify.jinc
index ad521ce9..effce7b6 100644
--- a/src/crypto_kem/kyber/common/amd64/ref/verify.jinc
+++ b/src/crypto_kem/kyber/common/amd64/ref/verify.jinc
@@ -25,7 +25,7 @@ fn __verify(reg u64 ctp, reg ptr u8[KYBER_INDCPA_BYTES] ctpc) -> reg u64
 inline
 fn __cmov(reg ptr u8[KYBER_SYMBYTES] dst, reg u64 src cnd) -> reg ptr u8[KYBER_SYMBYTES]
 {
-  reg u8 t1 t2 bcond;
+  reg u8 t1 t2;
   inline int i;
 
   cnd = -cnd;
diff --git a/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc b/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc
index c1bb634b..8c76ab6c 100644
--- a/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc
+++ b/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc
@@ -12,7 +12,7 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES]
   stack u8[64] buf;
   stack u8[KYBER_SYMBYTES] publicseed noiseseed;
   reg u64 t64;
-  reg u8 nonce, c;
+  reg u8 nonce;
   inline int i;
 
   spkp = pkp;
diff --git a/src/crypto_kem/kyber/kyber768/amd64/avx2/gen_matrix.jinc b/src/crypto_kem/kyber/kyber768/amd64/avx2/gen_matrix.jinc
index 9ae8a167..afbe3819 100644
--- a/src/crypto_kem/kyber/kyber768/amd64/avx2/gen_matrix.jinc
+++ b/src/crypto_kem/kyber/kyber768/amd64/avx2/gen_matrix.jinc
@@ -575,7 +575,6 @@ fn __gen_matrix(stack u8[KYBER_SYMBYTES] seed, inline int transposed) -> stack u
   stack u256 fs;
   reg u256 f;
   reg u64 ctr0 ctr1 ctr2 ctr3 tmp;
-  stack u64 ctr0_s;
   reg u8 flg0 flg1 bflg;
   reg bool b;
   reg bool zf;
diff --git a/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc b/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
index 34c8982f..b8581bd5 100644
--- a/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
+++ b/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
@@ -92,11 +92,10 @@ fn __indcpa_enc(stack u64 sctp, reg ptr u8[32] msgp, reg u64 pkp, reg ptr u8[KYB
 {
   stack u16[KYBER_VECN] pkpv sp ep bp;
   stack u16[KYBER_K*KYBER_VECN] aat;
-  stack u16[KYBER_N] k poly epp v poly0 poly1 poly2;
+  stack u16[KYBER_N] k epp v;
   stack u8[KYBER_SYMBYTES] publicseed;
-  reg u64 i j t64;
+  reg u64 i t64;
   reg u64 ctp;
-  reg u16 t;
   reg u8 nonce;
   stack ptr u8[KYBER_SYMBYTES] noiseseed_s;
 
@@ -175,10 +174,9 @@ fn __iindcpa_enc(reg ptr u8[KYBER_CT_LEN] ctp, reg ptr u8[32] msgp, reg u64 pkp,
 {
   stack u16[KYBER_VECN] pkpv sp ep bp;
   stack u16[KYBER_K*KYBER_VECN] aat;
-  stack u16[KYBER_N] k poly epp v poly0 poly1 poly2;
+  stack u16[KYBER_N] k epp v;
   stack u8[KYBER_SYMBYTES] publicseed;
-  reg u64 i j t64;
-  reg u16 t;
+  reg u64 i t64;
   reg u8 nonce;
   stack ptr u8[KYBER_CT_LEN] sctp;
   stack ptr u8[KYBER_SYMBYTES] noiseseed_s;
diff --git a/src/crypto_kem/kyber/kyber768/amd64/ref/poly.jinc b/src/crypto_kem/kyber/kyber768/amd64/ref/poly.jinc
index b36b9033..8befa13d 100644
--- a/src/crypto_kem/kyber/kyber768/amd64/ref/poly.jinc
+++ b/src/crypto_kem/kyber/kyber768/amd64/ref/poly.jinc
@@ -59,7 +59,6 @@ fn _poly_csubq(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N]
 
 fn _poly_basemul(reg ptr u16[KYBER_N] rp, reg const ptr u16[KYBER_N] ap bp) -> reg ptr u16[KYBER_N]
 {
-  reg u64 offset;
   reg u16 zeta;
   reg u16 r0;
   reg u16 r1;
@@ -316,7 +315,6 @@ fn _poly_frommsg(reg ptr u16[KYBER_N] rp, reg u64 ap) -> stack u16[KYBER_N]
   reg u8 c;
   reg u16 t;
   inline int i;
-  inline int j;
 
   for i = 0 to 32
   {
@@ -379,7 +377,6 @@ fn _i_poly_frommsg(reg ptr u16[KYBER_N] rp, reg ptr u8[32] ap) -> stack u16[KYBE
   reg u8 c;
   reg u16 t;
   inline int i;
-  inline int j;
 
   for i = 0 to 32
   {
@@ -441,7 +438,6 @@ fn _poly_getnoise(reg ptr u16[KYBER_N] rp, reg ptr u8[KYBER_SYMBYTES] seed, reg
 {
   stack u8[33] extseed;   /* 33 = KYBER_SYMBYTES +1 */
   stack u8[128] buf;      /* 128 = KYBER_ETA*KYBER_N/4 */
-  reg u64 outlen;
   reg u8 c,a,b;
   reg u16 t;
   reg u64 i j;
diff --git a/src/crypto_kem/kyber/kyber768/amd64/ref/polyvec.jinc b/src/crypto_kem/kyber/kyber768/amd64/ref/polyvec.jinc
index a9b3fec9..5025f146 100644
--- a/src/crypto_kem/kyber/kyber768/amd64/ref/polyvec.jinc
+++ b/src/crypto_kem/kyber/kyber768/amd64/ref/polyvec.jinc
@@ -25,7 +25,6 @@ fn __polyvec_compress(reg u64 rp, stack u16[KYBER_VECN] a)
 {
   stack u16[KYBER_VECN] aa;
   reg u16 c, b;
-  reg u16 d;
   reg u64[4] t;
   reg u64 i j;
   inline int k;
@@ -90,7 +89,6 @@ fn __i_polyvec_compress(reg ptr u8[KYBER_POLYVECCOMPRESSEDBYTES] rp, stack u16[K
 {
   stack u16[KYBER_VECN] aa;
   reg u16 c, b;
-  reg u16 d;
   reg u64[4] t;
   reg u64 i j;
   inline int k;
diff --git a/src/crypto_kem/kyber/kyber768/amd64/ref/verify.jinc b/src/crypto_kem/kyber/kyber768/amd64/ref/verify.jinc
index 986916c8..5c2746ee 100644
--- a/src/crypto_kem/kyber/kyber768/amd64/ref/verify.jinc
+++ b/src/crypto_kem/kyber/kyber768/amd64/ref/verify.jinc
@@ -30,7 +30,7 @@ fn __verify(reg u64 ctp, reg ptr u8[KYBER_CT_LEN] ctpc) -> reg u64
 inline
 fn __cmov(reg ptr u8[KYBER_SYMBYTES] dst, reg u64 src cnd) -> reg ptr u8[KYBER_SYMBYTES]
 {
-  reg u8 t1 t2 bcond;
+  reg u8 t1 t2;
   inline int i;
 
   cnd = -cnd;
diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/scalarmult.jazz b/src/crypto_scalarmult/curve25519/amd64/ref5/scalarmult.jazz
index 50d2d533..41043499 100644
--- a/src/crypto_scalarmult/curve25519/amd64/ref5/scalarmult.jazz
+++ b/src/crypto_scalarmult/curve25519/amd64/ref5/scalarmult.jazz
@@ -4,7 +4,6 @@ require "curve25519.jinc"
 export fn jade_scalarmult_curve25519_amd64_ref5(#spill_to_mmx reg u64 qp np pp) -> reg u64
 {
   reg u64 r;
-  stack u64 qps;
   reg u64[4] q n p;
 
   _ = #init_msf();
@@ -25,7 +24,6 @@ export fn jade_scalarmult_curve25519_amd64_ref5(#spill_to_mmx reg u64 qp np pp)
 export fn jade_scalarmult_curve25519_amd64_ref5_base(#spill_to_mmx reg u64 qp np) -> reg u64
 {
   reg u64 r;
-  stack u64 qps;
   reg u64[4] q n;
 
   _ = #init_msf();
diff --git a/src/crypto_sign/dilithium/common/amd64/avx2/expandA.jinc b/src/crypto_sign/dilithium/common/amd64/avx2/expandA.jinc
index 6d5d9af7..06f42bb1 100644
--- a/src/crypto_sign/dilithium/common/amd64/avx2/expandA.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/avx2/expandA.jinc
@@ -109,7 +109,6 @@ fn expandA_chunk(
 	
 	stack u64[4] coeffs_filled;
 	stack u64[4] xof_offset;
-	reg u64 lane;
 
 	reg u256 v256 v256_zero;
 	reg u64 v64;
diff --git a/src/crypto_sign/dilithium/common/amd64/avx2/expandA_end.jinc b/src/crypto_sign/dilithium/common/amd64/avx2/expandA_end.jinc
index 8c190c71..ee91634d 100644
--- a/src/crypto_sign/dilithium/common/amd64/avx2/expandA_end.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/avx2/expandA_end.jinc
@@ -8,8 +8,6 @@ fn expandA_aligned(stack u8[32] rho) -> stack u32[Li2_k * Li2_l * Li2_polydeg]
 	stack u32[Li2_k * Li2_l * Li2_polydeg] matrix;
 
 	stack u64[4] nonces;
-	stack u64[4] coeffs_left;
-	stack u64[4] xof_bytes_left;
 	
 	inline int row col chunk idx lane;
 
@@ -62,8 +60,6 @@ fn expandA_unaligned2(stack u8[32] rho) -> stack u32[Li2_k * Li2_l * Li2_polydeg
 	stack u32[Li2_polydeg] scratch0 scratch1;
 
 	stack u64[4] nonces;
-	stack u64[4] coeffs_left;
-	stack u64[4] xof_bytes_left;
 	
 	inline int row col chunk idx lane;
 
diff --git a/src/crypto_sign/dilithium/common/amd64/avx2/expandMask.jinc b/src/crypto_sign/dilithium/common/amd64/avx2/expandMask.jinc
index 7a53144e..177c067e 100644
--- a/src/crypto_sign/dilithium/common/amd64/avx2/expandMask.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/avx2/expandMask.jinc
@@ -105,8 +105,6 @@ fn expandMask_poly_gamma1_217_4x(
 	stack u64 y_packed_filled;
 	stack u64 output_squeeze_counter;
 	stack u256[25] xof;	
-	stack u64[4] coeffs_filled;
-	stack u64[4] xof_offset;
 
 	reg u64 addr v64;
 	reg u64 i j;
@@ -182,8 +180,6 @@ fn expandMask_poly_gamma1_219_4x(
 	stack u64 y_packed_filled;
 	stack u64 output_squeeze_counter;
 	stack u256[25] xof;	
-	stack u64[4] coeffs_filled;
-	stack u64[4] xof_offset;
 
 	reg u64 addr v64;
 	reg u64 i j;
diff --git a/src/crypto_sign/dilithium/common/amd64/avx2/expandMask_end.jinc b/src/crypto_sign/dilithium/common/amd64/avx2/expandMask_end.jinc
index 96ba1d4c..6ea6f10c 100644
--- a/src/crypto_sign/dilithium/common/amd64/avx2/expandMask_end.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/avx2/expandMask_end.jinc
@@ -13,8 +13,6 @@ fn expandMask_buffered(
 
 	reg u32 v32;
 
-	reg ptr u32[Li2_polydeg] poly;
-
 	?{}, polys_generated = #set0_64();
 	while (polys_generated < Li2_l) {
 		if buffer_offset >= 4 {
diff --git a/src/crypto_sign/dilithium/common/amd64/avx2/ntt.jinc b/src/crypto_sign/dilithium/common/amd64/avx2/ntt.jinc
index 07db8d35..1bbf7c80 100644
--- a/src/crypto_sign/dilithium/common/amd64/avx2/ntt.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/avx2/ntt.jinc
@@ -217,9 +217,7 @@ fn ntt_levels0t1(reg ptr u32[256] poly_ptr, reg u256 q, inline int offset)
 	// TODO: Interleave loads/stores with arithmetic ops
 
 	reg u256 zeta_qinv zeta;
-	reg u256 poly0 poly1 poly2 poly3 poly4 poly5 poly6 poly7 poly8;
-
-	inline int ii;
+	reg u256 poly0 poly1 poly2 poly3 poly4 poly5 poly6 poly7;
 
 	poly0 = #VMOVDQU_256(poly_ptr.[u256 (32 * (0*4 + offset))]);
 	poly1 = #VMOVDQU_256(poly_ptr.[u256 (32 * (1*4 + offset))]);
@@ -267,8 +265,7 @@ fn ntt_levels2t7(reg ptr u32[256] poly_ptr, reg u256 q, inline int offset)
 	// TODO: Interleave shuffles with butterflies
 
 	reg u256 zeta_qinv0 zeta_qinv1 zeta0 zeta1;
-	reg u256[8] poly;
-	reg u256 poly0 poly1 poly2 poly3 poly4 poly5 poly6 poly7 poly8 polyx;
+	reg u256 poly0 poly1 poly2 poly3 poly4 poly5 poly6 poly7 polyx;
 		
 	poly0 = #VMOVDQU_256(poly_ptr.[u256 32 * (8*offset + 0)]);
 	poly1 = #VMOVDQU_256(poly_ptr.[u256 32 * (8*offset + 1)]);
diff --git a/src/crypto_sign/dilithium/common/amd64/expandS.jinc b/src/crypto_sign/dilithium/common/amd64/expandS.jinc
index 2d9355f1..9667f528 100644
--- a/src/crypto_sign/dilithium/common/amd64/expandS.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/expandS.jinc
@@ -12,7 +12,6 @@ fn expandSEta2_poly(stack u8[64] rho_prime, reg u16 elem_idx, reg ptr u32[Li2_po
 
 	// temps
 	reg u64 i;
-	reg u64 addr;
 	
 	reg u8 c;
 	reg u32 c32 t0 t1;
@@ -89,10 +88,8 @@ fn expandSEta4_poly(stack u8[64] rho_prime, reg u16 elem_idx, reg ptr u32[Li2_po
 
 	// temps
 	reg u64 i;
-	reg u8 c1 c2;
-	reg u32 u32_c1;
+	reg u8 c1;
 	reg u32 v;
-	reg u64 addr;
 	
 	reg u8 c;
 	reg u32 c32;
@@ -144,4 +141,4 @@ fn expandSEta4_poly(stack u8[64] rho_prime, reg u16 elem_idx, reg ptr u32[Li2_po
 		}
 	}
 	return poly;
-}
\ No newline at end of file
+}
diff --git a/src/crypto_sign/dilithium/common/amd64/fips202.jinc b/src/crypto_sign/dilithium/common/amd64/fips202.jinc
index aeb015ad..3d41a8a5 100644
--- a/src/crypto_sign/dilithium/common/amd64/fips202.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/fips202.jinc
@@ -126,7 +126,6 @@ u64[24] roundconstants = {0x0000000000000001, 0x0000000000008082, 0x800000000000
                           0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
 
 fn __keccakf1600_ref(reg ptr u64[25] state) -> reg ptr u64[25] {
-  inline int round;
   reg ptr u64[24] constptr;
 
   reg u64 rctr;
diff --git a/src/crypto_sign/dilithium/common/amd64/keygen_end.jinc b/src/crypto_sign/dilithium/common/amd64/keygen_end.jinc
index ddf3d3dd..fad51941 100644
--- a/src/crypto_sign/dilithium/common/amd64/keygen_end.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/keygen_end.jinc
@@ -14,7 +14,6 @@ fn keygen_inner(reg ptr u8[32] random_zeta)
 	stack u8[32] rho;
 	reg ptr u8[32] rho_rsp;
 	stack u8[64] rho_prime;
-	reg ptr u8[64] rho_prime_rsp;
 	stack u8[32] k;
 	// FFTs
 	stack u32[Li2_k * Li2_l * Li2_polydeg] fft_matA;
@@ -22,9 +21,6 @@ fn keygen_inner(reg ptr u8[32] random_zeta)
 	reg ptr u32[Li2_l * Li2_polydeg] s1_rsp;
 	stack u32[Li2_k * Li2_polydeg] s2;
 	
-	stack u8[SHAKE256_RATE] s256_out;
-
-	stack u32[Li2_k * Li2_polydeg] s2;
 	stack u32[Li2_k * Li2_polydeg] t t1 t0;
 
 	reg ptr u8[Li2_pack_s1len] s1_in_sk;
@@ -36,7 +32,6 @@ fn keygen_inner(reg ptr u8[32] random_zeta)
 	//temp variables
 	reg u64 i;
 	inline int j;
-	reg u32 v32;
 	reg u8 c;
 
 	state = shake256_absorb32(random_zeta);
diff --git a/src/crypto_sign/dilithium/common/amd64/packing.jinc b/src/crypto_sign/dilithium/common/amd64/packing.jinc
index 07038863..59a78c9e 100644
--- a/src/crypto_sign/dilithium/common/amd64/packing.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/packing.jinc
@@ -19,7 +19,6 @@ fn polyeta_2_pack(reg ptr u32[Li2_polydeg] s1, reg ptr u8[Li2_pack_eta_2_len] de
 	inline int _eta;
 	reg u32 dest0 dest1 dest2;
 	reg u32 t1 t2 t3 t4 t5 t6 t7;
-	reg u32 coeff;
 	reg u64 i addr;
 
 	_eta = 2;
@@ -81,7 +80,6 @@ fn polyeta_4_pack(reg ptr u32[Li2_polydeg] s1, reg ptr u8[Li2_pack_eta_4_len] de
 	-> reg ptr u8[Li2_pack_eta_4_len]
 {
 	inline int _eta;
-	reg u8 value;
 
 	reg u32 lo hi;
 	reg u32 coeff;
@@ -117,7 +115,7 @@ fn polyeta_2_unpack(reg ptr u32[Li2_polydeg] r, reg ptr u8[Li2_pack_eta_2_len] a
 	inline int _eta;
 	reg u64 i;
 	reg u32 a0 a1 a2;
-	reg u32 c32_0 c32_1;
+	reg u32 c32_0;
 	reg u32 coeff;
 	reg u64 dest_addr src_addr;
 
@@ -715,7 +713,6 @@ fn polyt0_pack(reg ptr u32[Li2_polydeg] t0, reg ptr u8[Li2_pack_t0len] dest)
 	reg u64 addr;
 
 	reg u32 v32;
-	stack u32 v32s;
 
 	reg u8 c c1;
 
diff --git a/src/crypto_sign/dilithium/common/amd64/poly.jinc b/src/crypto_sign/dilithium/common/amd64/poly.jinc
index f5c73db1..875bf6a3 100644
--- a/src/crypto_sign/dilithium/common/amd64/poly.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/poly.jinc
@@ -38,9 +38,6 @@ fn poly_subtract(reg ptr u32[Li2_polydeg] f g difference)
 fn poly_accumulate(reg ptr u32[Li2_polydeg] f sum)
 	-> reg ptr u32[Li2_polydeg]
 {
-	reg u32 temp;
-	stack u32 x1 x2 y;
-
 	reg u32 v32 result;
 
 	reg u64 i;
@@ -147,4 +144,4 @@ fn poly_checknorm(reg ptr u32[Li2_polydeg] f, inline int threshold)
 
 	result_s = result;
 	return result_s;
-}
\ No newline at end of file
+}
diff --git a/src/crypto_sign/dilithium/common/amd64/verify_end.jinc b/src/crypto_sign/dilithium/common/amd64/verify_end.jinc
index 7bd0b87a..ebb3ff35 100644
--- a/src/crypto_sign/dilithium/common/amd64/verify_end.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/verify_end.jinc
@@ -23,7 +23,7 @@ fn unpack_hints(reg ptr u8[Li2_omega + Li2_k] hints_buf, reg ptr u32[Li2_k * Li2
     reg u64 i j;
     reg u8 done fail status;
     reg bool tmp;
-    reg u32 zero one;
+    reg u32 zero;
     reg u64 k hints_elem_offset idx idx1 idx2 idxtmp;
     reg u64 hints_cumpop; // cumulative popcount of hints
 
@@ -119,10 +119,9 @@ fn verify_inner(stack ptr u8[Li2_SIGN_LEN] sig, reg u64 m, reg u64 m_len, stack
     reg u32 r_status;
     stack u32 status;
     stack u8 z_normcheck_fail c_tilde_result;
-    reg u64 i j;
+    reg u64 i;
     reg u8 byte;
-    reg u8 k l hints_popcount hints_popcount_fail done;
-    reg u64 hint_index;
+    reg u8 hints_popcount_fail;
 
     reg ptr u8[Li2_SIGN_LEN] sig_rsp;
     reg ptr u8[Li2_PK_LEN] pk_rsp;
@@ -130,7 +129,7 @@ fn verify_inner(stack ptr u8[Li2_SIGN_LEN] sig, reg u64 m, reg u64 m_len, stack
     stack u64[25] keccak_state;
     reg ptr u64[25] keccak_state_rsp;
     stack u8[32] tr c_tilde c_tilde2;
-    reg ptr u8[32] tr_rsp c_tilde_rsp c_tilde2_rsp;
+    reg ptr u8[32] tr_rsp;
     stack u8[64] mu;
 
     stack u32[Li2_k * Li2_l * Li2_polydeg] fft_matA;
@@ -144,8 +143,6 @@ fn verify_inner(stack ptr u8[Li2_SIGN_LEN] sig, reg u64 m, reg u64 m_len, stack
     reg ptr u8[Li2_pack_t1len] t1_buf;
     reg ptr u8[Li2_omega + Li2_k] hints_buf;
 
-    reg u32 coeff;
-       
     inline int ii;
 
     //status = 0xFF;
@@ -230,10 +227,8 @@ inline
 fn verify(reg u64 ptr_sig, reg u64 ptr_m, reg u64 m_len, reg u64 ptr_pk) -> reg u32 {
     stack u8[Li2_PK_LEN] pk;
     reg ptr u8[Li2_PK_LEN] pk_rsp;
-    stack ptr u8[Li2_PK_LEN] pk_ssp;
     stack u8[Li2_SIGN_LEN] sig;
     reg ptr u8[Li2_SIGN_LEN] sig_rsp;
-    stack ptr u8[Li2_SIGN_LEN] sig_ssp;
 
     reg u8 byte;
     reg u64 i;
diff --git a/src/crypto_sign/dilithium/dilithium2/amd64/avx2/common.jinc b/src/crypto_sign/dilithium/dilithium2/amd64/avx2/common.jinc
index a3b6363c..d8f19529 100644
--- a/src/crypto_sign/dilithium/dilithium2/amd64/avx2/common.jinc
+++ b/src/crypto_sign/dilithium/dilithium2/amd64/avx2/common.jinc
@@ -28,9 +28,6 @@ fn use_hint(reg u32 a hint)
 inline
 fn expandMask_poly_4x(reg ptr u32[4 * Li2_polydeg] f_4x, reg ptr u8[64] rho_prime, stack u16 kappa)
 	-> reg ptr u32[4 * Li2_polydeg] {
-	reg u64 i;
-	reg u32 v32;
-
 	f_4x = expandMask_poly_gamma1_217_4x(f_4x, rho_prime, kappa);
 	return f_4x;
 }
@@ -54,4 +51,4 @@ fn expandA(stack u8[32] rho) -> stack u32[Li2_k * Li2_l * Li2_polydeg] {
 	stack u32[Li2_k * Li2_l * Li2_polydeg] matrix;
 	matrix = expandA_aligned(rho);
 	return matrix;
-}
\ No newline at end of file
+}
diff --git a/src/crypto_sign/dilithium/dilithium3/amd64/avx2/common.jinc b/src/crypto_sign/dilithium/dilithium3/amd64/avx2/common.jinc
index e904742e..614e4a78 100644
--- a/src/crypto_sign/dilithium/dilithium3/amd64/avx2/common.jinc
+++ b/src/crypto_sign/dilithium/dilithium3/amd64/avx2/common.jinc
@@ -28,9 +28,6 @@ fn use_hint(reg u32 a hint)
 inline
 fn expandMask_poly_4x(reg ptr u32[4 * Li2_polydeg] f_4x, reg ptr u8[64] rho_prime, stack u16 kappa)
 	-> reg ptr u32[4 * Li2_polydeg] {
-	reg u64 i;
-	reg u32 v32;
-
 	f_4x = expandMask_poly_gamma1_219_4x(f_4x, rho_prime, kappa);
 	return f_4x;
 }
diff --git a/src/crypto_sign/falcon/falcon512/amd64/avx2/test.jazz b/src/crypto_sign/falcon/falcon512/amd64/avx2/test.jazz
index 7349e298..f1892f09 100644
--- a/src/crypto_sign/falcon/falcon512/amd64/avx2/test.jazz
+++ b/src/crypto_sign/falcon/falcon512/amd64/avx2/test.jazz
@@ -10,7 +10,6 @@ fn __decode_public_key_external(reg u64 h pk) -> reg u32 {
     reg u32 failed;
     stack u16[ARRAY_N] h_buff;
     reg u64 i;
-    reg u16 tmp16;
 
     h_buff, failed = __decode_public_key(h_buff, pk);
 

From 65b2a51c18fe9c088895754951e73989380b98b1 Mon Sep 17 00:00:00 2001
From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com>
Date: Fri, 19 Apr 2024 19:16:55 +0800
Subject: [PATCH 13/19] frodo: remove redundant spills, pass speculative constant-time (SCT) check

---
 .../common/amd64/ref/matrix_mul_opt.jinc      | 29 +++++--------------
 .../frodo/common/amd64/ref/shake128_opt.jinc  |  8 ++---
 .../frodo/common/amd64/ref/shake256_opt.jinc  | 16 +++++-----
 .../frodo/frodo640shake/amd64/ref/kem.jazz    | 15 ++++++----
 .../frodo/frodo640shake/amd64/ref/kem.jinc    | 24 +++++++--------
 5 files changed, 41 insertions(+), 51 deletions(-)

diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc
index deca2d84..9ecb91e7 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc
@@ -1,7 +1,7 @@
 // notes: "16" instead of BYTES_SEED_A on purpose
 // compilation should fail if BYTES_SEED_A changes
 
-inline fn __pad_seedA(reg ptr u8[16] seedA) -> reg ptr u8[2 + 16 + 6] {
+inline fn __pad_seedA(reg ptr u8[16] seedA) -> stack u8[2 + 16 + 6] {
   reg u64 i j;
   stack u8[2+16+ 6] s_index_seed_padding;
   reg u8 v;
@@ -31,10 +31,9 @@ fn __AS_plus_E_opt(
   ->
   reg ptr u16[NNBAR]
 {
-  reg ptr u8[2+16+ 6] index_seed_padding;
+  stack u8[2+16+ 6] index_seed_padding;
 
-  stack u16[N * 8] s_A;
-  reg ptr u16[N * 8] A;
+  stack u16[N * 8] A;
   reg u64 A_offset B_offset S_offset;
 
   inline int p;
@@ -50,14 +49,12 @@ fn __AS_plus_E_opt(
     i += 1;
   }
 
-  () = #spill(E, index_seed_padding);
+  () = #spill(E);
 
-  A = s_A;
   i = 0;
   B_offset = 0;
   while( i < N ) {
     () = #spill(B, S);
-    () = #unspill(index_seed_padding);
 
     A_offset = 0;
     j = 0;
@@ -75,7 +72,6 @@ fn __AS_plus_E_opt(
     }
 
     () = #unspill(B, S);
-    () = #spill(index_seed_padding);
 
     j = 0;
     jN = 0;
@@ -128,10 +124,9 @@ fn __SA_plus_E_opt(
   ->
   reg ptr u16[NNBAR]
 {
-  reg ptr u8[2+16+ 6] index_seed_padding;
+  stack u8[2+16+ 6] index_seed_padding;
 
-  stack u16[N * 8] s_A;
-  reg ptr u16[N * 8] A;
+  stack u16[N * 8] A;
   reg u64 A_offset B_offset S_offset;
 
   inline int p;
@@ -141,23 +136,16 @@ fn __SA_plus_E_opt(
 
   index_seed_padding = __pad_seedA(seedA);
 
-  () = #spill(index_seed_padding);
-
-  A = s_A;
   i = 0;
   while( i < N )
   {
-
-    () = #spill(B, S);
-    () = #unspill(index_seed_padding);
-
-    //
     A_offset = 0;
     j = 0;
+    () = #spill(B, S);
+
     while( j < 8 )
     {
       ij = #LEA(i + j);
-
       () = #spill(i, j);
 
       index_seed_padding[u16 0] = (16u) ij;
@@ -168,7 +156,6 @@ fn __SA_plus_E_opt(
     }
 
     () = #unspill(B, S);
-    () = #spill(index_seed_padding);
 
     //
     j = 0;
diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
index 320c29af..d00a2f52 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
@@ -233,11 +233,11 @@ fn __shake128_pkh_opt(
         j += 1;
     }
 
-    () = #spill(i, j);
+    () = #spill(i, j, in);
 
     state = __keccakf1600_ref1(state);
 
-    () = #unspill(i, j);
+    () = #unspill(i, j, in);
 
     i += SHAKE128_RATE/8;
   }
@@ -438,11 +438,11 @@ fn __shake128_ss_opt(
         j += 1;
     }
 
-    () = #spill(i, j);
+    () = #spill(i, j, in);
 
     state = __keccakf1600_ref1(state);
 
-    () = #unspill(i, j);
+    () = #unspill(i, j, in);
 
     i += SHAKE128_RATE/8;
   }
diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc
index 177ee356..f89b82eb 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc
@@ -89,11 +89,11 @@ fn __shake256_r_opt(
 
   i = 0;
   while (i < OUTRND * SHAKE256_RATE/8) {
-    () = #spill(i, j, out);
+    () = #spill(i, out);
 
     state = __keccakf1600_ref1(state);
 
-    () = #unspill(i, j, out);
+    () = #unspill(i, out);
 
     j = 0;
     while (j < SHAKE256_RATE/8) {
@@ -107,11 +107,11 @@ fn __shake256_r_opt(
     i += SHAKE256_RATE/8;
   }
 
-  () = #spill(i, j, out);
+  () = #spill(i, out);
 
   state = __keccakf1600_ref1(state);
 
-  () = #unspill(i, j, out);
+  () = #unspill(i, out);
 
   i = 0;
   while (i < (OUTLEN % SHAKE256_RATE) / 8) {
@@ -160,11 +160,11 @@ fn __shake256_pkh_opt(
         j += 1;
     }
 
-    () = #spill(i, j);
+    () = #spill(i, j, in);
 
     state = __keccakf1600_ref1(state);
 
-    () = #unspill(i, j);
+    () = #unspill(i, j, in);
 
     i += SHAKE256_RATE/8;
   }
@@ -365,11 +365,11 @@ fn __shake256_ss_opt(
         j += 1;
     }
 
-    () = #spill(i, j);
+    () = #spill(i, j, in);
 
     state = __keccakf1600_ref1(state);
 
-    () = #unspill(i, j);
+    () = #unspill(i, j, in);
 
     i += SHAKE256_RATE/8;
   }
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jazz b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jazz
index b6dd27f1..82fd4de5 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jazz
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jazz
@@ -1,36 +1,41 @@
 from Jade require "crypto_kem/frodo/common/frodo640_params.jinc"
 from Jade require "crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc"
 
-export fn jade_kem_frodo_frodo640shake_amd64_ref_keypair_derand(#public reg u64 pkp skp coinsp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo640shake_amd64_ref_keypair_derand(reg u64 pkp skp coinsp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_keypair_derand(pkp, skp, coinsp);
     ?{}, r = #set0();
     return r;
 }
 
-export fn jade_kem_frodo_frodo640shake_amd64_ref_keypair(#public reg u64 pkp skp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo640shake_amd64_ref_keypair(reg u64 pkp skp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_keypair(pkp, skp);
     ?{}, r = #set0();
     return r;
 }
 
-export fn jade_kem_frodo_frodo640shake_amd64_ref_enc_derand(#public reg u64 ctp ssp pkp coinsp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo640shake_amd64_ref_enc_derand(reg u64 ctp ssp pkp coinsp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_enc_derand(ctp, ssp, pkp, coinsp);
     ?{}, r = #set0();
     return r;
 }
 
-export fn jade_kem_frodo_frodo640shake_amd64_ref_enc(#public reg u64 ctp ssp pkp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo640shake_amd64_ref_enc(reg u64 ctp ssp pkp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_enc(ctp, ssp, pkp);
     ?{}, r = #set0();
     return r;
 }
 
-export fn jade_kem_frodo_frodo640shake_amd64_ref_dec(#public reg u64 ssp ctp skp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo640shake_amd64_ref_dec(reg u64 ssp ctp skp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_dec(ssp, ctp, skp);
     ?{}, r = #set0();
     return r;
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
index 922c5a50..6568a3a4 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
@@ -21,28 +21,27 @@ fn __frodo_amd64_ref_keypair_derand(
     // s || seedA || b || S_T || pkh
     stack u8[BYTES_SK] sk;
 
-    () = #spill(i, j, pkp, skp);
+    () = #spill(pkp, skp);
 
     for k = 0 to BYTES_SEC/8 {
         sk[u64 k] = coins[u64 k];
     }
 
     // gen seedA
+    () = #spill(coins); // stack_coins = coins
     pk[0:BYTES_SEED_A] = __shake128_seed_A_opt(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]);
 
     // gen S || E
+    () = #unspill(coins); // coins = stack_coins
     SE = __shake128_r_opt(SE, coins[BYTES_SEC:BYTES_SEED_SE]);
     SE = __sample_2NNBAR(SE);
 
-    () = #spill(coins);
-
     // B = A*S+E
     B = __AS_plus_E_opt(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]);
 
     // pack
     pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B);
 
-    () = #unspill(i);
     i = 0;
     while (i < BYTES_PK/8) {
         sk[u64 BYTES_SEC/8 + i] = pk[u64 i];
@@ -54,11 +53,11 @@ fn __frodo_amd64_ref_keypair_derand(
         sk[u64 BYTES_SEC/8 + BYTES_PK/8 + i] = SE[u64 i];
         i += 1;
     }
-    () = #spill(i);
 
     sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake128_pkh_opt(sk[BYTES_SEC + BYTES_PK + 2 * NNBAR:BYTES_SEC], pk);
 
-    () = #unspill(i, j, pkp, skp);
+    () = #unspill(pkp, skp);
+    _ = #init_msf();
     i = 0; j = 0;
     while (i < BYTES_PK/8) {
         [pkp + j] = pk[u64 i];
@@ -102,9 +101,6 @@ fn __frodo_amd64_ref_enc_derand(
     reg ptr u16[NBAR * NBAR] V;
     stack u8[BYTES_SEC] ss;
 
-    pkp = pkp;
-    () = #spill(ctp, ssp, i, j);
-
     // gen u || salt
     for k = 0 to (BYTES_SEC + BYTES_SALT)/8 {
         pkh_u_salt[u64 BYTES_SEC/8 + k] = coins[u64 k];
@@ -114,7 +110,6 @@ fn __frodo_amd64_ref_enc_derand(
         ct_k[u64 (D * N + D * NBAR)/8 + k] = pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k];
     }
 
-    () = #unspill(i, j);
     // read pk
     i = 0; j = 0;
     while (i < BYTES_PK/8) {
@@ -122,7 +117,8 @@ fn __frodo_amd64_ref_enc_derand(
         i += 1;
         j += 8;
     }
-    () = #spill(i, j);
+
+    () = #spill(ctp, ssp, i, j);
 
     // pkh
     pkh_u_salt[0:BYTES_SEC] = __shake128_pkh_opt(pkh_u_salt[0:BYTES_SEC], pk);
@@ -145,6 +141,7 @@ fn __frodo_amd64_ref_enc_derand(
 
     // B' = S'A + E''
     Bp = SEE[NNBAR:NNBAR];
+
     Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]);
 
     // c1 <- Pack(B')
@@ -169,6 +166,7 @@ fn __frodo_amd64_ref_enc_derand(
 
     () = #unspill(i, j, ctp, ssp);
     i = 0; j = 0;
+    _ = #init_msf();
     while (i < BYTES_CT/8) {
         [ctp + j] = ct_k[u64 i];
         i += 1;
@@ -296,6 +294,7 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) {
 
     ss = __shake128_ss_opt(ss, ct_k);
 
+    _ = #init_msf();
     ssp = s_ssp;
     for k = 0 to BYTES_SEC/8 {
         [ssp + 8*k] = ss[u64 k];
@@ -315,7 +314,7 @@ fn _frodo_amd64_ref_keypair(reg u64 pkp skp) {
 
 fn _frodo_amd64_ref_keypair_derand(reg u64 pkp skp coinsp) {
     #public stack u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins;
-    reg u64 i; stack u64 s_i;
+    reg u64 i;
 
     pkp = pkp;
     skp = skp;
@@ -326,7 +325,6 @@ fn _frodo_amd64_ref_keypair_derand(reg u64 pkp skp coinsp) {
         i += 1;
     }
 
-    s_i = i;
     __frodo_amd64_ref_keypair_derand(pkp, skp, coins);
 }
 

From 11523744d7d782e58442156487e95164d2f39d68 Mon Sep 17 00:00:00 2001
From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com>
Date: Mon, 29 Apr 2024 16:34:01 +0800
Subject: [PATCH 14/19] frodo640shake: move IND-CPA keypair generation into indcpa.jinc

---
 .../frodo/frodo640shake/amd64/ref/indcpa.jinc | 38 +++++++++++
 .../frodo/frodo640shake/amd64/ref/kem.jinc    | 66 +++++++++----------
 2 files changed, 68 insertions(+), 36 deletions(-)
 create mode 100644 src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc

diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
new file mode 100644
index 00000000..60649cab
--- /dev/null
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
@@ -0,0 +1,38 @@
+
+inline
+fn __indcpa_keypair_derand(
+  #spill_to_mmx reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE] coins
+) -> stack u8[BYTES_PK], stack u8[2*NNBAR] {
+    stack u8[BYTES_PK] pk; // seedA || b
+    stack u8[2*NNBAR] sk; // S_T
+    stack u16[2 * NNBAR] SE;
+    stack u16[NNBAR] B;
+
+    reg u64 i;
+
+    i = 0;
+    while (i < BYTES_SEED_A/8) {
+        pk[u64 i] = coins[u64 i];
+        i += 1;
+    }
+
+    () = #spill(coins);
+    // gen S || E
+    SE = __shake128_r_opt(SE, coins[BYTES_SEED_A:BYTES_SEED_SE]);
+
+    SE = __sample_2NNBAR(SE);
+
+    // B = A*S+E
+    B = __AS_plus_E_opt(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]);
+
+    // pack
+    pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B);
+
+    i = 0;
+    while (i < 2 * NNBAR / 8) {
+        sk[u64 i] = SE[u64 i];
+        i += 1;
+    }
+
+    return pk, sk;
+}
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
index 6568a3a4..63172e6a 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
@@ -4,72 +4,66 @@ from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc"
 from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc"
 from Jade require "crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc"
 from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc"
+require "./indcpa.jinc"
 
 // coins = s || seed SE || z
+inline
 fn __frodo_amd64_ref_keypair_derand(
   reg u64 pkp skp,
-  #spill_to_mmx reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins) {
-    stack u16[2 * NNBAR] SE;
-    stack u16[NNBAR] B;
-
-    inline int k;
-    reg u64 i j;
+  #spill_to_mmx reg ptr u8[BYTES_SEC + BYTES_SEED_SE + BYTES_SEED_A] coins) {
+    reg u64 i;
 
     // seedA || b
     stack u8[BYTES_PK] pk;
+    stack u8[BYTES_SEED_A + BYTES_SEED_SE] indcoins;
+    stack u8[BYTES_SEC] pkh;
 
-    // s || seedA || b || S_T || pkh
-    stack u8[BYTES_SK] sk;
-
-    () = #spill(pkp, skp);
+    // S_T
+    stack u8[2*NNBAR] sk;
 
-    for k = 0 to BYTES_SEC/8 {
-        sk[u64 k] = coins[u64 k];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        [skp + i*8] = coins[u64 i];
+        i += 1;
     }
 
-    // gen seedA
-    () = #spill(coins); // stack_coins = coins
-    pk[0:BYTES_SEED_A] = __shake128_seed_A_opt(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]);
+    // copy seedSE
+    i = 0;
+    while (i < BYTES_SEED_SE/8) {
+        indcoins[u64 BYTES_SEED_A/8 + i] = coins[u64 BYTES_SEC/8 + i];
+        i += 1;
+    }
 
-    // gen S || E
-    () = #unspill(coins); // coins = stack_coins
-    SE = __shake128_r_opt(SE, coins[BYTES_SEC:BYTES_SEED_SE]);
-    SE = __sample_2NNBAR(SE);
+    () = #spill(pkp, skp, coins);
+    indcoins[0:BYTES_SEED_A] = __shake128_seed_A_opt(indcoins[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]);
 
-    // B = A*S+E
-    B = __AS_plus_E_opt(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]);
+    pk, sk = __indcpa_keypair_derand(indcoins);
+    pkh = __shake128_pkh_opt(pkh, pk);
 
-    // pack
-    pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B);
+    () = #unspill(pkp, skp);
 
     i = 0;
     while (i < BYTES_PK/8) {
-        sk[u64 BYTES_SEC/8 + i] = pk[u64 i];
+        [skp + BYTES_SEC + i*8] = pk[u64 i];
         i += 1;
     }
 
     i = 0;
     while (i < 2 * NNBAR / 8) {
-        sk[u64 BYTES_SEC/8 + BYTES_PK/8 + i] = SE[u64 i];
+        [skp + BYTES_SEC + BYTES_PK + i*8] = sk[u64 i];
         i += 1;
     }
 
-    sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake128_pkh_opt(sk[BYTES_SEC + BYTES_PK + 2 * NNBAR:BYTES_SEC], pk);
-
-    () = #unspill(pkp, skp);
-    _ = #init_msf();
-    i = 0; j = 0;
+    i = 0;
     while (i < BYTES_PK/8) {
-        [pkp + j] = pk[u64 i];
+        [pkp + i*8] = pk[u64 i];
         i += 1;
-        j += 8;
     }
 
-    i = 0; j = 0;
-    while (i < BYTES_SK/8) {
-        [skp + j] = sk[u64 i];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        [skp + BYTES_SK - BYTES_SEC + i*8] = pkh[u64 i];
         i += 1;
-        j += 8;
     }
 }
 

From 97cc255d3bdd62be9d0e8c7751f1886e4c83b7f7 Mon Sep 17 00:00:00 2001
From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com>
Date: Mon, 29 Apr 2024 18:56:49 +0800
Subject: [PATCH 15/19] indcpa enc

---
 .../frodo/common/amd64/ref/shake128_opt.jinc  |  51 ++++++++
 .../frodo/frodo640shake/amd64/ref/indcpa.jinc |  73 +++++++++++-
 .../frodo/frodo640shake/amd64/ref/kem.jinc    | 110 +++++++-----------
 3 files changed, 166 insertions(+), 68 deletions(-)

diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
index d00a2f52..a7d481f7 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
@@ -268,6 +268,57 @@ fn __shake128_pkh_opt(
   return out;
 }
 
+fn __shake128_SE_k_opt2(
+  #spill_to_mmx reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out,
+  #spill_to_mmx reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) 
+-> reg ptr u8[BYTES_SEED_SE + BYTES_SEC] {
+  #spill_to_mmx reg u64 i;
+
+  stack u64[25] s_state;
+  reg ptr u64[25] state;
+  reg u64 t0 zero;
+  inline int INLEN OUTLEN;
+
+  INLEN = 2 * BYTES_SEC + BYTES_SALT;
+  OUTLEN = BYTES_SEED_SE + BYTES_SEC;
+
+  state = s_state;
+
+  i = 0;
+  while (i < INLEN/8) {
+    t0 = in[u64 i];
+    state[i] = t0;
+
+    i += 1;
+  }
+  ?{}, zero = #set0();
+
+  i = INLEN/8;
+  while (i < 25) {
+    state[i] = zero;
+    i += 1;
+  }
+
+  state[u8 INLEN] = 0x1f;
+  state[u8 SHAKE128_RATE-1] = 0x80;
+
+  () = #spill(out);
+
+  state = __keccakf1600_ref1(state);
+
+  () = #unspill(out);
+
+  i = 0;
+  while (i < OUTLEN/8) {
+    t0 = state[u64 i];
+    out[u64 i] = t0;
+    i += 1;
+  }
+
+  return out;
+}
+
+
 fn __shake128_SE_k_opt(
   #spill_to_mmx reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] out,
   #spill_to_mmx reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) 
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
index 60649cab..7bc509bd 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
@@ -8,11 +8,12 @@ fn __indcpa_keypair_derand(
     stack u16[2 * NNBAR] SE;
     stack u16[NNBAR] B;
 
-    reg u64 i;
+    reg u64 i t;
 
     i = 0;
     while (i < BYTES_SEED_A/8) {
-        pk[u64 i] = coins[u64 i];
+        t = coins[u64 i];
+        pk[u64 i] = t;
         i += 1;
     }
 
@@ -30,9 +31,75 @@ fn __indcpa_keypair_derand(
 
     i = 0;
     while (i < 2 * NNBAR / 8) {
-        sk[u64 i] = SE[u64 i];
+        t = SE[u64 i];
+        sk[u64 i] = t;
         i += 1;
     }
 
     return pk, sk;
 }
+
+inline
+fn __indcpa_enc_derand(
+  #spill_to_mmx reg ptr u8[BYTES_CT - BYTES_SALT] ct,
+  #spill_to_mmx reg ptr u8[BYTES_SEC] u,
+  #spill_to_mmx reg ptr u8[BYTES_PK] pk,
+  #spill_to_mmx reg ptr u8[BYTES_SEED_SE] coins
+) -> reg ptr u8[BYTES_CT - BYTES_SALT] {
+    reg u64 i t;
+
+    // 0x96 || seed_SE
+    stack u8[1 + BYTES_SEED_SE] seedSE;
+    seedSE[0] = 0x96;
+
+    // S' || E' || E''
+    stack u16[2 * NNBAR + NBAR * NBAR] SEE;
+    stack u16[NNBAR] B;
+    reg ptr u16[NNBAR] Bp;
+    stack u16[NBAR * NBAR] C;
+    reg ptr u16[NBAR * NBAR] V;
+
+    // stack u8[BYTES_CT - BYTES_SALT] ct;
+
+    i = 0;
+    while (i < BYTES_SEED_SE/8) {
+        t = coins[u64 i];
+        seedSE.[u64 1 + 8*i] = t;
+        i += 1;
+    }
+
+    // B <- Unpack(b)
+    B = __unpack_B(B, pk[BYTES_SEED_A:D * N]);
+    C = __encode(C, u);
+
+    () = #spill(ct, u, pk, coins);
+
+    // gen input bit string for sampling S and E
+    SEE = __shake128_encap_r_opt(SEE, seedSE);
+
+    // S' || E'
+    SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]);
+    // E''
+    SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]);
+
+    // B' = S'A + E'
+    Bp = SEE[NNBAR:NNBAR];
+
+    () = #unspill(pk);
+    Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]);
+
+    // V = S'B + E''
+    V = SEE[NNBAR*2:NBAR*NBAR];
+    V = __SB_plus_E_opt(V, SEE[0:NNBAR], B);
+
+    // C = V + Encode(u)
+    C = __matrix_add(C, V);
+
+    // c1 <- Pack(B')
+    () = #unspill(ct);
+    ct[0:D * N] = __pack_B(ct[0:D * N], Bp);
+    // c2 <- Pack(C)
+    ct[D * N: D * NBAR] = __pack_C(ct[D * N: D * NBAR], C);
+
+    return ct;
+}
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
index 63172e6a..b7540e59 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
@@ -11,7 +11,7 @@ inline
 fn __frodo_amd64_ref_keypair_derand(
   reg u64 pkp skp,
   #spill_to_mmx reg ptr u8[BYTES_SEC + BYTES_SEED_SE + BYTES_SEED_A] coins) {
-    reg u64 i;
+    reg u64 i t;
 
     // seedA || b
     stack u8[BYTES_PK] pk;
@@ -23,14 +23,16 @@ fn __frodo_amd64_ref_keypair_derand(
 
     i = 0;
     while (i < BYTES_SEC/8) {
-        [skp + i*8] = coins[u64 i];
+        t = coins[u64 i];
+        [skp + i*8] = t;
         i += 1;
     }
 
     // copy seedSE
     i = 0;
     while (i < BYTES_SEED_SE/8) {
-        indcoins[u64 BYTES_SEED_A/8 + i] = coins[u64 BYTES_SEC/8 + i];
+        t = coins[u64 BYTES_SEC/8 + i];
+        indcoins[u64 BYTES_SEED_A/8 + i] = t;
         i += 1;
     }
 
@@ -44,34 +46,38 @@ fn __frodo_amd64_ref_keypair_derand(
 
     i = 0;
     while (i < BYTES_PK/8) {
-        [skp + BYTES_SEC + i*8] = pk[u64 i];
+        t = pk[u64 i];
+        [skp + BYTES_SEC + i*8] = t;
         i += 1;
     }
 
     i = 0;
     while (i < 2 * NNBAR / 8) {
-        [skp + BYTES_SEC + BYTES_PK + i*8] = sk[u64 i];
+        t = sk[u64 i];
+        [skp + BYTES_SEC + BYTES_PK + i*8] = t;
         i += 1;
     }
 
     i = 0;
     while (i < BYTES_PK/8) {
-        [pkp + i*8] = pk[u64 i];
+        t = pk[u64 i];
+        [pkp + i*8] = t;
         i += 1;
     }
 
     i = 0;
     while (i < BYTES_SEC/8) {
-        [skp + BYTES_SK - BYTES_SEC + i*8] = pkh[u64 i];
+        t = pkh[u64 i];
+        [skp + BYTES_SK - BYTES_SEC + i*8] = t;
         i += 1;
     }
 }
 
-#[returnaddress="stack"]
+inline
 fn __frodo_amd64_ref_enc_derand(
   reg u64 ctp ssp pkp,
   #spill_to_mmx reg ptr u8[BYTES_SEC + BYTES_SALT] coins) {
-    reg u64 i j;
+    reg u64 i t;
     inline int k;
 
     // seedA || b
@@ -82,93 +88,67 @@ fn __frodo_amd64_ref_enc_derand(
 
     // pkh || u || salt
     stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt;
-    // 0x96 || seedSE || k
-    stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k;
-    seedSE_k[0] = 0x96;
-
-    // S' || E' || E''
-    stack u16[2 * NNBAR + NBAR * NBAR] SEE;
+    // seedSE || k
+    stack u8[BYTES_SEED_SE + BYTES_SEC] seedSE_k;
 
-    stack u16[NNBAR] B;
-    reg ptr u16[NNBAR] Bp;
-    stack u16[NBAR * NBAR] C;
-    reg ptr u16[NBAR * NBAR] V;
     stack u8[BYTES_SEC] ss;
 
     // gen u || salt
-    for k = 0 to (BYTES_SEC + BYTES_SALT)/8 {
-        pkh_u_salt[u64 BYTES_SEC/8 + k] = coins[u64 k];
+    i = 0;
+    while (i < (BYTES_SEC + BYTES_SALT)/8) {
+        t = coins[u64 i];
+        pkh_u_salt[u64 BYTES_SEC/8 + i] = t;
+        i += 1;
     }
 
-    for k = 0 to BYTES_SALT/8 {
-        ct_k[u64 (D * N + D * NBAR)/8 + k] = pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k];
+    i = 0;
+    while (i < BYTES_SALT/8) {
+        t = coins[u64 BYTES_SEC/8 + i];
+        ct_k[u64 (BYTES_CT - BYTES_SALT)/8 + i] = t;
+        i += 1;
     }
 
     // read pk
-    i = 0; j = 0;
+    i = 0;
     while (i < BYTES_PK/8) {
-        #declassify pk[u64 i] = [pkp + j];
+        #declassify pk[u64 i] = [pkp + i*8];
         i += 1;
-        j += 8;
     }
 
-    () = #spill(ctp, ssp, i, j);
+    () = #spill(ctp, ssp, coins);
 
     // pkh
     pkh_u_salt[0:BYTES_SEC] = __shake128_pkh_opt(pkh_u_salt[0:BYTES_SEC], pk);
 
     // seedSE || k
-    seedSE_k = __shake128_SE_k_opt(seedSE_k, pkh_u_salt);
+    seedSE_k = __shake128_SE_k_opt2(seedSE_k, pkh_u_salt);
+
+    () = #unspill(coins);
+    ct_k[0:BYTES_CT - BYTES_SALT] = __indcpa_enc_derand(ct_k[0:BYTES_CT - BYTES_SALT], coins[0:BYTES_SEC], pk, seedSE_k[0:BYTES_SEED_SE]);
 
     // copy k
-    for k = 0 to BYTES_SEC/8 {
-        ct_k[u64 BYTES_CT/8 + k] = seedSE_k.[u64 1 + BYTES_SEED_SE + 8*k];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = seedSE_k[u64 BYTES_SEED_SE/8 + i];
+        ct_k[u64 BYTES_CT/8 + i] = t;
+        i += 1;
     }
 
-    // gen input bit string for sampling S and E
-    SEE = __shake128_encap_r_opt(SEE, seedSE_k[0 : 1 + BYTES_SEED_SE]);
-
-    // S' || E'
-    SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]);
-    // E''
-    SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]);
-
-    // B' = S'A + E''
-    Bp = SEE[NNBAR:NNBAR];
-
-    Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]);
-
-    // c1 <- Pack(B')
-    ct_k[0:D * N] = __pack_B(ct_k[0:D * N], Bp);
-
-    // B <- Unpack(b)
-    B = __unpack_B(B, pk[BYTES_SEED_A:D * N]);
-
-    // V = S'B + E''
-    V = SEE[NNBAR*2:NBAR*NBAR];
-    V = __SB_plus_E_opt(V, SEE[0:NNBAR], B);
-
-    // C = V + Encode(u)
-    C = __encode(C, pkh_u_salt[BYTES_SEC:BYTES_SEC]);
-    C = __matrix_add(C, V);
-
-    // c2 <- Pack(C)
-    ct_k[D * N: D * NBAR] = __pack_C(ct_k[D * N: D * NBAR], C);
-
     // ss <- shake(c1 || c2 || salt || k)
     ss = __shake128_ss_opt(ss, ct_k);
 
-    () = #unspill(i, j, ctp, ssp);
-    i = 0; j = 0;
+    () = #unspill(ctp, ssp);
+    i = 0;
     _ = #init_msf();
     while (i < BYTES_CT/8) {
-        [ctp + j] = ct_k[u64 i];
+        t = ct_k[u64 i];
+        [ctp + i*8] = t;
         i += 1;
-        j += 8;
     }
 
     for k = 0 to BYTES_SEC/8 {
-        [ssp + 8*k] = ss[u64 k];
+        t = ss[u64 k];
+        [ssp + 8*k] = t;
     }
 }
 

From 8bb5828c447e8785833bc6b90b03556b2ef3d6c7 Mon Sep 17 00:00:00 2001
From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com>
Date: Tue, 30 Apr 2024 15:50:25 +0800
Subject: [PATCH 16/19] indcpa dec

---
 .../frodo/common/amd64/ref/matrix.jinc        |  24 ++++
 .../frodo/frodo640shake/amd64/ref/indcpa.jinc |  22 ++++
 .../frodo/frodo640shake/amd64/ref/kem.jinc    | 115 ++++++------------
 3 files changed, 84 insertions(+), 77 deletions(-)

diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc
index 535c338a..9ae4155b 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc
@@ -82,6 +82,30 @@ fn __ct_verify_NBAR2(reg ptr u16[NBAR * NBAR] a b) -> stack u8 {
     return r;
 }
 
+fn __ct_verify(reg ptr u8[BYTES_CT - BYTES_SALT] a b) -> stack u8 {
+    reg u64 i;
+    reg u16 ac tmp;
+    reg u8 r;
+
+    i = 0;
+    ac = 0;
+    while (i < (BYTES_CT - BYTES_SALT)/2) {
+        tmp = a[u16 i];
+        tmp ^= b[u16 i];
+        ac |= tmp;
+        i += 1;
+    }
+
+    tmp = ac * -1;
+    ac |= tmp;
+    ac >>= 15;
+    ac *= (-1);
+
+    r = (8u) ac;
+
+    return r;
+}
+
 #[returnaddress="stack"]
 fn __ct_select(reg ptr u8[BYTES_SEC] out a b, reg u8 selector) -> stack u8[BYTES_SEC] {
     reg u64 i;
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
index 7bc509bd..9971e509 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
@@ -103,3 +103,25 @@ fn __indcpa_enc_derand(
 
     return ct;
 }
+
+inline
+fn __indcpa_dec(
+  #spill_to_mmx reg ptr u8[BYTES_SEC] pt,
+  #spill_to_mmx reg ptr u8[BYTES_CT - BYTES_SALT] ct,
+  #spill_to_mmx reg ptr u8[2*NNBAR] sk
+) -> reg ptr u8[BYTES_SEC] {
+    stack u16[NNBAR] Bp;
+    stack u16[NBAR * NBAR] M C;
+
+    // B' <- Unpack(c1)
+    Bp = __unpack_B(Bp, ct[0:D * N]);
+    // C <- Unpack(c2)
+    C = __unpack_C(C, ct[D * N:D * NBAR]);
+
+    // M = C - B'S
+    M = __mul_BS_opt(M, Bp, sk);
+    M = __matrix_sub(M, C);
+    pt = __decode(pt, M);
+
+    return pt;
+}
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
index b7540e59..8c5f5562 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
@@ -152,126 +152,87 @@ fn __frodo_amd64_ref_enc_derand(
     }
 }
 
-#[returnaddress="stack"]
 fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) {
     #public stack u8[BYTES_PK] pk;
     stack u8[2 * NNBAR] ST;
     stack u8[BYTES_SEC] s;
     stack u8[BYTES_CT + BYTES_SEC] ct_k;
-    stack u16[NNBAR] B Bp;
-    reg ptr u16[NNBAR] Bpp;
-    stack u16[NBAR * NBAR] M C Cp;
-    reg ptr u16[NBAR * NBAR] V;
+    stack u8[BYTES_CT - BYTES_SALT] ct2;
     stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt;
-    stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k;
+    stack u8[BYTES_SEED_SE + BYTES_SEC] seedSE_k;
     stack u8[BYTES_SEC] ss;
 
-    // S' || E' || E''
-    stack u16[2 * NNBAR + NBAR * NBAR] SEE;
-
-    reg u8 s1 s2;
-    reg u64 i j t;
-    stack u64 s_ssp s_skp;
-    inline int k;
+    reg u8 s1;
+    reg u64 i t;
 
     ctp = ctp;
     skp = skp;
-    s_ssp = ssp;
+    ssp = ssp;
 
     // copy pkh
-    for k = 0 to BYTES_SEC/8 {
-        pkh_u_salt[u64 k] = [skp + BYTES_SK - BYTES_SEC + 8*k];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = [skp + BYTES_SK - BYTES_SEC + i*8];
+        pkh_u_salt[u64 i] = t;
+        i += 1;
     }
-    s_skp = skp;
 
     // read ct
-    i = 0; j = 0;
+    i = 0;
     while (i < BYTES_CT/8) {
-        t = [ctp + j];
+        t = [ctp + i*8];
         ct_k[u64 i] = t;
         i += 1;
-        j += 8;
     }
 
-    for k = 0 to BYTES_SEC/8 {
-        s[u64 k] = [skp + 8*k];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = [skp + i*8];
+        s[u64 i] = t;
+        i += 1;
     }
 
-    i = 0; j = 0;
+    i = 0;
     while (i < BYTES_PK/8) {
-        #declassify pk[u64 i] = [skp + BYTES_SEC + j];
+        t = [skp + BYTES_SEC + i*8];
+        #declassify pk[u64 i] = t;
         i += 1;
-        j += 8;
     }
 
-    i = 0; j = 0;
+    i = 0;
     while (i < 2 * NNBAR/8) {
-        ST[u64 i] = [skp + BYTES_SEC + BYTES_PK + j];
+        t = [skp + BYTES_SEC + BYTES_PK + i*8];
+        ST[u64 i] = t;
         i += 1;
-        j += 8;
     }
 
-    () = #spill(i);
-
     // copy salt
-    for k = 0 to BYTES_SALT/8 {
-        pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k] = ct_k[u64 (BYTES_CT - BYTES_SALT)/8 + k];
-    }
-
-    // B' <- Unpack(c1)
-    Bp = __unpack_B(Bp, ct_k[0:D * N]);
-    // C <- Unpack(c2)
-    C = __unpack_C(C, ct_k[D * N:D * NBAR]);
-
-    // M = C - B'S
-    M = __mul_BS_opt(M, Bp, ST);
-    M = __matrix_sub(M, C);
-
-    pkh_u_salt[BYTES_SEC:BYTES_SEC] = __decode(pkh_u_salt[BYTES_SEC:BYTES_SEC], M);
-
-    seedSE_k[0] = 0x96;
-    seedSE_k = __shake128_SE_k_opt(seedSE_k, pkh_u_salt);
-
-    SEE = __shake128_encap_r_opt(SEE, seedSE_k[0: 1 + BYTES_SEED_SE]);
-
-    // S' || E'
-    SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]);
-    // E''
-    SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]);
-
-    // B'' = S'A + E'
-    Bpp = SEE[NNBAR:NNBAR];
-    Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]);
-
-    // B'' (mod q)
-    () = #unspill(i);
     i = 0;
-    while (i < NNBAR) {
-        Bpp[i] &= (1 << D) - 1;
+    while (i < BYTES_SALT/8) {
+        t = ct_k[u64 (BYTES_CT - BYTES_SALT)/8 + i];
+        pkh_u_salt[u64 (BYTES_SEC * 2)/8 + i] = t;
         i += 1;
     }
 
-    //
-    B = __unpack_B(B, pk[BYTES_SEED_A:BYTES_PK - BYTES_SEED_A]);
+    pkh_u_salt[BYTES_SEC:BYTES_SEC] = __indcpa_dec(pkh_u_salt[BYTES_SEC:BYTES_SEC], ct_k[0:BYTES_CT - BYTES_SALT], ST);
 
-    V = SEE[NNBAR*2:NBAR*NBAR];
-    V = __SB_plus_E_opt(V, SEE[0:NNBAR], B);
-
-    Cp = __encode(Cp, pkh_u_salt[BYTES_SEC:BYTES_SEC]);
-    Cp = __matrix_add(Cp, V);
+    () = #spill(ssp);
+    seedSE_k = __shake128_SE_k_opt2(seedSE_k, pkh_u_salt);
+    ct2 = __indcpa_enc_derand(ct2, pkh_u_salt[BYTES_SEC:BYTES_SEC], pk, seedSE_k[0:BYTES_SEED_SE]);
 
-    s1 = __ct_verify_NNBAR(Bp, Bpp);
-    s2 = __ct_verify_NBAR2(C, Cp);
-    s1 |= s2;
+    s1 = __ct_verify(ct_k[0:BYTES_CT - BYTES_SALT], ct2);
 
-    ct_k[BYTES_CT:BYTES_SEC] = __ct_select(ct_k[BYTES_CT:BYTES_SEC], seedSE_k[1+BYTES_SEED_SE:BYTES_SEC], s, s1);
+    ct_k[BYTES_CT:BYTES_SEC] = __ct_select(ct_k[BYTES_CT:BYTES_SEC], seedSE_k[BYTES_SEED_SE:BYTES_SEC], s, s1);
 
     ss = __shake128_ss_opt(ss, ct_k);
 
     _ = #init_msf();
-    ssp = s_ssp;
-    for k = 0 to BYTES_SEC/8 {
-        [ssp + 8*k] = ss[u64 k];
+    () = #unspill(ssp);
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = ss[u64 i];
+        [ssp + i*8] = t;
+        i += 1;
     }
 }
 

From c688898b8e190b3ac874a0589971eba6a5c371a6 Mon Sep 17 00:00:00 2001
From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com>
Date: Mon, 24 Jun 2024 16:32:27 +0800
Subject: [PATCH 17/19] remove returnaddress="stack"

---
 src/crypto_kem/frodo/common/amd64/ref/matrix.jinc     | 4 ----
 src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc | 4 ----
 src/crypto_kem/frodo/common/amd64/ref/shake128.jinc   | 7 -------
 src/crypto_kem/frodo/common/amd64/ref/shake256.jinc   | 6 ------
 4 files changed, 21 deletions(-)

diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc
index 9ae4155b..323e89cb 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc
@@ -14,7 +14,6 @@ fn __matrix_add(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] {
     return a;
 }
 
-#[returnaddress="stack"]
 // a = b - a
 fn __matrix_sub(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] {
     reg u64 i;
@@ -32,7 +31,6 @@ fn __matrix_sub(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] {
     return a;
 }
 
-#[returnaddress="stack"]
 fn __ct_verify_NNBAR(reg ptr u16[NNBAR] a b) -> stack u8 {
     reg u64 i;
     reg u16 ac tmp;
@@ -57,7 +55,6 @@ fn __ct_verify_NNBAR(reg ptr u16[NNBAR] a b) -> stack u8 {
     return r;
 }
 
-#[returnaddress="stack"]
 fn __ct_verify_NBAR2(reg ptr u16[NBAR * NBAR] a b) -> stack u8 {
     reg u64 i;
     reg u16 ac tmp;
@@ -106,7 +103,6 @@ fn __ct_verify(reg ptr u8[BYTES_CT - BYTES_SALT] a b) -> stack u8 {
     return r;
 }
 
-#[returnaddress="stack"]
 fn __ct_select(reg ptr u8[BYTES_SEC] out a b, reg u8 selector) -> stack u8[BYTES_SEC] {
     reg u64 i;
 
diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc
index 0008277a..4deee180 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc
@@ -1,6 +1,5 @@
 from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc"
 
-#[returnaddress="stack"]
 fn __AS_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] {
     stack ptr u16[NNBAR] s_B;
     stack u16[N] A_row;
@@ -64,7 +63,6 @@ fn __AS_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[
     return B;
 }
 
-#[returnaddress="stack"]
 fn __SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] {
     stack ptr u16[NNBAR] s_B;
     stack u16[N] A_row;
@@ -123,7 +121,6 @@ fn __SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[
     return B;
 }
 
-#[returnaddress="stack"]
 fn __SB_plus_E(reg ptr u16[NBAR * NBAR] V, reg ptr u16[NNBAR] S B, reg ptr u16[NBAR * NBAR] E) -> stack u16[NBAR * NBAR] {
     reg u64 k tj;
     reg u16 tmp ac;
@@ -160,7 +157,6 @@ fn __SB_plus_E(reg ptr u16[NBAR * NBAR] V, reg ptr u16[NNBAR] S B, reg ptr u16[N
     return V;
 }
 
-#[returnaddress="stack"]
 fn __mul_BS(reg ptr u16[NBAR * NBAR] M, reg ptr u16[NNBAR]B S) -> stack u16[NBAR * NBAR] {
     reg u64 k tj;
     reg u16 tmp;
diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc
index d287adb3..98cbb016 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc
@@ -2,7 +2,6 @@ from Jade require "common/keccak/keccak1600/amd64/ref1/keccak1600.jinc"
 
 param int SHAKE128_RATE = 168;
 
-#[returnaddress="stack"]
 fn __shake128_gen_A(reg ptr u8[2 * N] out, reg const ptr u8[2 + BYTES_SEED_A] in) -> stack u8[2 * N]
 {
   stack ptr u8[2 * N] s_out;
@@ -59,7 +58,6 @@ fn __shake128_gen_A(reg ptr u8[2 * N] out, reg const ptr u8[2 + BYTES_SEED_A] in
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake128_seed_A(reg ptr u8[BYTES_SEED_A] out, reg const ptr u8[BYTES_SEED_A] in) -> stack u8[BYTES_SEED_A]
 {
   stack ptr u8[BYTES_SEED_A] s_out;
@@ -93,7 +91,6 @@ fn __shake128_seed_A(reg ptr u8[BYTES_SEED_A] out, reg const ptr u8[BYTES_SEED_A
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake128_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] in) -> stack u8[4 * NNBAR]
 {
   stack ptr u8[4 * NNBAR] s_out;
@@ -156,7 +153,6 @@ fn __shake128_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] i
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake128_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> stack u8[BYTES_SEC]
 {
   stack ptr u8[BYTES_SEC] s_out;
@@ -220,7 +216,6 @@ fn __shake128_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> s
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake128_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) -> stack u8[BYTES_SEED_SE + BYTES_SEC] {
   stack ptr u8[BYTES_SEED_SE + BYTES_SEC] s_out;
   stack ptr u8[2 * BYTES_SEC + BYTES_SALT] s_in;
@@ -262,7 +257,6 @@ fn __shake128_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2
 }
 
 
-#[returnaddress="stack"]
 fn __shake128_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const ptr u8[1 + BYTES_SEED_SE] in) -> stack u8[2 * (2 * NNBAR + NBAR * NBAR)] {
   stack ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] s_out;
   stack ptr u8[1 + BYTES_SEED_SE] s_in;
@@ -323,7 +317,6 @@ fn __shake128_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const p
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake128_ss(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_CT + BYTES_SEC] in) -> stack u8[BYTES_SEC]
 {
   stack ptr u8[BYTES_SEC] s_out;
diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake256.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake256.jinc
index 1ed5acf0..ab9dadc4 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/shake256.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/shake256.jinc
@@ -2,7 +2,6 @@ from Jade require "common/keccak/keccak1600/amd64/ref1/keccak1600.jinc"
 
 param int SHAKE256_RATE = 136;
 
-#[returnaddress="stack"]
 fn __shake256_seed_A(reg ptr u8[BYTES_SEED_A] out, reg const ptr u8[BYTES_SEED_A] in) -> stack u8[BYTES_SEED_A]
 {
   stack ptr u8[BYTES_SEED_A] s_out;
@@ -44,7 +43,6 @@ fn __shake256_seed_A(reg ptr u8[BYTES_SEED_A] out, reg const ptr u8[BYTES_SEED_A
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake256_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] in) -> stack u8[4 * NNBAR]
 {
   stack ptr u8[4 * NNBAR] s_out;
@@ -106,7 +104,6 @@ fn __shake256_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] i
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake256_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> stack u8[BYTES_SEC]
 {
   stack ptr u8[BYTES_SEC] s_out;
@@ -169,7 +166,6 @@ fn __shake256_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> s
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake256_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) -> stack u8[BYTES_SEED_SE + BYTES_SEC] {
   stack ptr u8[BYTES_SEED_SE + BYTES_SEC] s_out;
   stack ptr u8[2 * BYTES_SEC + BYTES_SALT] s_in;
@@ -214,7 +210,6 @@ fn __shake256_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2
 }
 
 
-#[returnaddress="stack"]
 fn __shake256_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const ptr u8[1 + BYTES_SEED_SE] in) -> stack u8[2 * (2 * NNBAR + NBAR * NBAR)] {
   stack ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] s_out;
   stack ptr u8[1 + BYTES_SEED_SE] s_in;
@@ -274,7 +269,6 @@ fn __shake256_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const p
   return out;
 }
 
-#[returnaddress="stack"]
 fn __shake256_ss(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_CT + BYTES_SEC] in) -> stack u8[BYTES_SEC]
 {
   stack ptr u8[BYTES_SEC] s_out;

From a51c1c6b23ea0d0e33ff3319c733da9bc2720561 Mon Sep 17 00:00:00 2001
From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com>
Date: Mon, 24 Jun 2024 17:11:48 +0800
Subject: [PATCH 18/19] remove unnecessary require

---
 src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc    | 2 --
 src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc | 1 -
 2 files changed, 3 deletions(-)

diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc
index 4deee180..55a0377e 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc
@@ -1,5 +1,3 @@
-from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc"
-
 fn __AS_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] {
     stack ptr u16[NNBAR] s_B;
     stack u16[N] A_row;
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
index 9971e509..2c181c31 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/indcpa.jinc
@@ -1,4 +1,3 @@
-
 inline
 fn __indcpa_keypair_derand(
   #spill_to_mmx reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE] coins

From 5ff8d9092a3ce1b50cd6f3fcd61a1ae8a03ea3b3 Mon Sep 17 00:00:00 2001
From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com>
Date: Mon, 24 Jun 2024 17:50:29 +0800
Subject: [PATCH 19/19] update frodo976shake accordingly

---
 .../frodo/common/amd64/ref/shake128_opt.jinc  |  54 +---
 .../frodo/common/amd64/ref/shake256_opt.jinc  |  17 +-
 .../frodo/frodo640shake/amd64/ref/kem.jinc    |   4 +-
 .../frodo/frodo976shake/amd64/ref/indcpa.jinc | 126 ++++++++
 .../frodo/frodo976shake/amd64/ref/kem.jazz    |  15 +-
 .../frodo/frodo976shake/amd64/ref/kem.jinc    | 285 +++++++-----------
 6 files changed, 256 insertions(+), 245 deletions(-)
 create mode 100644 src/crypto_kem/frodo/frodo976shake/amd64/ref/indcpa.jinc

diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
index a7d481f7..657042f3 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc
@@ -268,7 +268,7 @@ fn __shake128_pkh_opt(
   return out;
 }
 
-fn __shake128_SE_k_opt2(
+fn __shake128_SE_k_opt(
   #spill_to_mmx reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out,
   #spill_to_mmx reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) 
 -> reg ptr u8[BYTES_SEED_SE + BYTES_SEC] {
@@ -319,58 +319,6 @@ fn __shake128_SE_k_opt2(
 }
 
 
-fn __shake128_SE_k_opt(
-  #spill_to_mmx reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] out,
-  #spill_to_mmx reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) 
--> reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] {
-  #spill_to_mmx reg u64 i;
-
-  stack u64[25] s_state;
-  reg ptr u64[25] state;
-  reg u64 offset t0 zero;
-  inline int INLEN OUTLEN;
-
-  INLEN = 2 * BYTES_SEC + BYTES_SALT;
-  OUTLEN = BYTES_SEED_SE + BYTES_SEC;
-
-  state = s_state;
-
-  i = 0;
-  while (i < INLEN/8) {
-    t0 = in[u64 i];
-    state[i] = t0;
-
-    i += 1;
-  }
-  ?{}, zero = #set0();
-
-  i = INLEN/8;
-  while (i < 25) {
-    state[i] = zero;
-    i += 1;
-  }
-
-  state[u8 INLEN] = 0x1f;
-  state[u8 SHAKE128_RATE-1] = 0x80;
-
-  () = #spill(out);
-
-  state = __keccakf1600_ref1(state);
-
-  () = #unspill(out);
-
-  i = 0;
-  while (i < OUTLEN/8) {
-    t0 = state[u64 i];
-    offset = #LEA(1+8*i);
-    out.[u64 offset] = t0;
-
-    i += 1;
-  }
-
-  return out;
-}
-
 fn __shake128_encap_r_opt(
   #spill_to_mmx reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out,
   #spill_to_mmx reg const ptr u8[1 + BYTES_SEED_SE] in) 
diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc
index f89b82eb..a3514809 100644
--- a/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc
+++ b/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc
@@ -89,11 +89,11 @@ fn __shake256_r_opt(
 
   i = 0;
   while (i < OUTRND * SHAKE256_RATE/8) {
-    () = #spill(i, out);
+    () = #spill(i, j, out);
 
     state = __keccakf1600_ref1(state);
 
-    () = #unspill(i, out);
+    () = #unspill(i, j, out);
 
     j = 0;
     while (j < SHAKE256_RATE/8) {
@@ -107,11 +107,11 @@ fn __shake256_r_opt(
     i += SHAKE256_RATE/8;
   }
 
-  () = #spill(i, out);
+  () = #spill(i, j, out);
 
   state = __keccakf1600_ref1(state);
 
-  () = #unspill(i, out);
+  () = #unspill(i, j, out);
 
   i = 0;
   while (i < (OUTLEN % SHAKE256_RATE) / 8) {
@@ -196,14 +196,14 @@ fn __shake256_pkh_opt(
 }
 
 fn __shake256_SE_k_opt(
-  #spill_to_mmx reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] out,
+  #spill_to_mmx reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out,
   #spill_to_mmx reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) 
--> reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] {
+-> reg ptr u8[BYTES_SEED_SE + BYTES_SEC] {
   #spill_to_mmx reg u64 i;
 
   stack u64[25] s_state;
   reg ptr u64[25] state;
-  reg u64 offset t0 zero;
+  reg u64 t0 zero;
   inline int INLEN OUTLEN;
 
   INLEN = 2 * BYTES_SEC + BYTES_SALT;
@@ -238,8 +238,7 @@ fn __shake256_SE_k_opt(
   i = 0;
   while (i < OUTLEN/8) {
     t0 = state[u64 i];
-    offset = #LEA(1+8*i);
-    out.[u64 offset] = t0;
+    out[u64 i] = t0;
 
     i += 1;
   }
diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
index 8c5f5562..77dd9e06 100644
--- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
+++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc
@@ -121,7 +121,7 @@ fn __frodo_amd64_ref_enc_derand(
     pkh_u_salt[0:BYTES_SEC] = __shake128_pkh_opt(pkh_u_salt[0:BYTES_SEC], pk);
 
     // seedSE || k
-    seedSE_k = __shake128_SE_k_opt2(seedSE_k, pkh_u_salt);
+    seedSE_k = __shake128_SE_k_opt(seedSE_k, pkh_u_salt);
 
     () = #unspill(coins);
     ct_k[0:BYTES_CT - BYTES_SALT] = __indcpa_enc_derand(ct_k[0:BYTES_CT - BYTES_SALT], coins[0:BYTES_SEC], pk, seedSE_k[0:BYTES_SEED_SE]);
@@ -217,7 +217,7 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) {
     pkh_u_salt[BYTES_SEC:BYTES_SEC] = __indcpa_dec(pkh_u_salt[BYTES_SEC:BYTES_SEC], ct_k[0:BYTES_CT - BYTES_SALT], ST);
 
     () = #spill(ssp);
-    seedSE_k = __shake128_SE_k_opt2(seedSE_k, pkh_u_salt);
+    seedSE_k = __shake128_SE_k_opt(seedSE_k, pkh_u_salt);
     ct2 = __indcpa_enc_derand(ct2, pkh_u_salt[BYTES_SEC:BYTES_SEC], pk, seedSE_k[0:BYTES_SEED_SE]);
 
     s1 = __ct_verify(ct_k[0:BYTES_CT - BYTES_SALT], ct2);
diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/indcpa.jinc b/src/crypto_kem/frodo/frodo976shake/amd64/ref/indcpa.jinc
new file mode 100644
index 00000000..534481e9
--- /dev/null
+++ b/src/crypto_kem/frodo/frodo976shake/amd64/ref/indcpa.jinc
@@ -0,0 +1,126 @@
+inline // Builds pk (seedA || packed B) and sk (S half of SE) from coins = seedA bytes || seedSE bytes.
+fn __indcpa_keypair_derand(
+  #spill_to_mmx reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE] coins
+) -> stack u8[BYTES_PK], stack u8[2*NNBAR] {
+    stack u8[BYTES_PK] pk; // seedA || b
+    stack u8[2*NNBAR] sk; // S_T
+    stack u16[2 * NNBAR] SE; // S in SE[0:NNBAR], E in SE[NNBAR:NNBAR] (see the __AS_plus_E_opt call)
+    stack u16[NNBAR] B;
+
+    reg u64 i t;
+
+    i = 0; // copy the first BYTES_SEED_A bytes of coins (seedA) into pk, one u64 word per iteration
+    while (i < BYTES_SEED_A/8) {
+        t = coins[u64 i];
+        pk[u64 i] = t;
+        i += 1;
+    }
+
+    () = #spill(coins); // NOTE(review): there is no matching #unspill(coins) in this function
+    // gen S || E from the BYTES_SEED_SE bytes that follow seedA in coins
+    SE = __shake256_r_opt(SE, coins[BYTES_SEED_A:BYTES_SEED_SE]);
+
+    SE = __sample_2NNBAR(SE); // presumably turns the raw SHAKE output into noise samples in place -- confirm in noise.jinc
+
+    // B = A*S+E, with A derived from the seedA bytes stored at the start of pk
+    B = __AS_plus_E_opt(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]);
+
+    // pack B into the D * N bytes of pk that follow seedA
+    pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B);
+
+    i = 0; // sk <- first 2*NNBAR bytes of SE, i.e. the SE[0:NNBAR] half passed as S above
+    while (i < 2 * NNBAR / 8) {
+        t = SE[u64 i];
+        sk[u64 i] = t;
+        i += 1;
+    }
+
+    return pk, sk;
+}
+
+inline // Fills ct with Pack(B') || Pack(C) computed from message u, public key pk and the seed bytes in coins.
+fn __indcpa_enc_derand(
+  #spill_to_mmx reg ptr u8[BYTES_CT - BYTES_SALT] ct,
+  #spill_to_mmx reg ptr u8[BYTES_SEC] u,
+  #spill_to_mmx reg ptr u8[BYTES_PK] pk,
+  #spill_to_mmx reg ptr u8[BYTES_SEED_SE] coins
+) -> reg ptr u8[BYTES_CT - BYTES_SALT] {
+    reg u64 i t;
+
+    // 0x96 || seed_SE
+    stack u8[1 + BYTES_SEED_SE] seedSE;
+    seedSE[0] = 0x96;
+
+    // S' || E' || E''  (S' = SEE[0:NNBAR], E' = SEE[NNBAR:NNBAR], E'' = SEE[2*NNBAR:NBAR*NBAR])
+    stack u16[2 * NNBAR + NBAR * NBAR] SEE;
+    stack u16[NNBAR] B;
+    reg ptr u16[NNBAR] Bp;
+    stack u16[NBAR * NBAR] C;
+    reg ptr u16[NBAR * NBAR] V;
+
+    // ct is not a local buffer: it is the caller-supplied reg ptr parameter, written in place.
+
+    i = 0; // copy the BYTES_SEED_SE bytes of coins behind the 0x96 prefix byte
+    while (i < BYTES_SEED_SE/8) {
+        t = coins[u64 i];
+        seedSE.[u64 1 + 8*i] = t; // byte offset 1 + 8*i: the leading 1 skips seedSE[0]
+        i += 1;
+    }
+
+    // B <- Unpack(b), where b is the D * N bytes of pk that follow seedA
+    B = __unpack_B(B, pk[BYTES_SEED_A:D * N]);
+    C = __encode(C, u); // C starts as Encode(u); V is added to it further down
+
+    () = #spill(ct, u, pk, coins); // pk and ct are unspilled below, right before their next use
+
+    // gen input bit string for sampling S and E
+    SEE = __shake256_encap_r_opt(SEE, seedSE);
+
+    // S' || E'
+    SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]);
+    // E''
+    SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]);
+
+    // B' = S'A + E'  (Bp points at the E' slice of SEE, so the result overwrites E')
+    Bp = SEE[NNBAR:NNBAR];
+
+    () = #unspill(pk);
+    Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]);
+
+    // V = S'B + E''  (V points at the E'' slice of SEE)
+    V = SEE[NNBAR*2:NBAR*NBAR];
+    V = __SB_plus_E_opt(V, SEE[0:NNBAR], B);
+
+    // C = V + Encode(u)
+    C = __matrix_add(C, V);
+
+    // c1 <- Pack(B')
+    () = #unspill(ct);
+    ct[0:D * N] = __pack_B(ct[0:D * N], Bp);
+    // c2 <- Pack(C)
+    ct[D * N: D * NBAR] = __pack_C(ct[D * N: D * NBAR], C);
+
+    return ct;
+}
+
+inline // Recovers the BYTES_SEC-byte message pt from ct = c1 || c2 using the secret matrix bytes in sk.
+fn __indcpa_dec(
+  #spill_to_mmx reg ptr u8[BYTES_SEC] pt,
+  #spill_to_mmx reg ptr u8[BYTES_CT - BYTES_SALT] ct,
+  #spill_to_mmx reg ptr u8[2*NNBAR] sk
+) -> reg ptr u8[BYTES_SEC] {
+    stack u16[NNBAR] Bp;
+    stack u16[NBAR * NBAR] M C;
+
+    // B' <- Unpack(c1), c1 being the first D * N bytes of ct
+    Bp = __unpack_B(Bp, ct[0:D * N]);
+    // C <- Unpack(c2), c2 being the D * NBAR bytes that follow c1
+    C = __unpack_C(C, ct[D * N:D * NBAR]);
+
+    // M = C - B'S, computed in two steps: M <- B'S, then M <- result of __matrix_sub(M, C)
+    M = __mul_BS_opt(M, Bp, sk);
+    M = __matrix_sub(M, C); // NOTE(review): expected to yield C - M -- confirm operand order in matrix.jinc
+    pt = __decode(pt, M);
+
+    return pt;
+}
diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jazz b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jazz
index c64c8260..3edd6dd1 100644
--- a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jazz
+++ b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jazz
@@ -1,36 +1,41 @@
 from Jade require "crypto_kem/frodo/common/frodo976_params.jinc"
 from Jade require "crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc"
 
-export fn jade_kem_frodo_frodo976shake_amd64_ref_keypair_derand(#public reg u64 pkp skp coinsp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo976shake_amd64_ref_keypair_derand(reg u64 pkp skp coinsp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_keypair_derand(pkp, skp, coinsp);
     ?{}, r = #set0();
     return r;
 }
 
-export fn jade_kem_frodo_frodo976shake_amd64_ref_keypair(#public reg u64 pkp skp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo976shake_amd64_ref_keypair(reg u64 pkp skp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_keypair(pkp, skp);
     ?{}, r = #set0();
     return r;
 }
 
-export fn jade_kem_frodo_frodo976shake_amd64_ref_enc_derand(#public reg u64 ctp ssp pkp coinsp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo976shake_amd64_ref_enc_derand(reg u64 ctp ssp pkp coinsp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_enc_derand(ctp, ssp, pkp, coinsp);
     ?{}, r = #set0();
     return r;
 }
 
-export fn jade_kem_frodo_frodo976shake_amd64_ref_enc(#public reg u64 ctp ssp pkp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo976shake_amd64_ref_enc(reg u64 ctp ssp pkp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_enc(ctp, ssp, pkp);
     ?{}, r = #set0();
     return r;
 }
 
-export fn jade_kem_frodo_frodo976shake_amd64_ref_dec(#public reg u64 ssp ctp skp) -> #public reg u64 {
+export fn jade_kem_frodo_frodo976shake_amd64_ref_dec(reg u64 ssp ctp skp) -> reg u64 {
     reg u64 r;
+    _ = #init_msf();
     _frodo_amd64_ref_dec(ssp, ctp, skp);
     ?{}, r = #set0();
     return r;
diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc
index c1ed48cd..aa677759 100644
--- a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc
+++ b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc
@@ -5,81 +5,80 @@ from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc"
 from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc"
 from Jade require "crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc"
 from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc"
+require "./indcpa.jinc"
 
 // coins = s || seed SE || z
+inline
 fn __frodo_amd64_ref_keypair_derand(
   reg u64 pkp skp,
-  #spill_to_mmx reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins) {
-    stack u16[2 * NNBAR] SE;
-    stack u16[NNBAR] B;
-
-    inline int k;
-    reg u64 i j;
+  #spill_to_mmx reg ptr u8[BYTES_SEC + BYTES_SEED_SE + BYTES_SEED_A] coins) {
+    reg u64 i t;
 
     // seedA || b
     stack u8[BYTES_PK] pk;
+    stack u8[BYTES_SEED_A + BYTES_SEED_SE] indcoins;
+    stack u8[BYTES_SEC] pkh;
 
-    // s || seedA || b || S_T || pkh
-    stack u8[BYTES_SK] sk;
-
-    () = #spill(i, j, pkp, skp);
+    // S_T
+    stack u8[2*NNBAR] sk;
 
-    for k = 0 to BYTES_SEC/8 {
-        sk[u64 k] = coins[u64 k];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = coins[u64 i];
+        [skp + i*8] = t;
+        i += 1;
     }
 
-    // gen seedA
-    pk[0:BYTES_SEED_A] = __shake256_seed_A_opt(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]);
-
-    // gen S || E
-    SE = __shake256_r_opt(SE, coins[BYTES_SEC:BYTES_SEED_SE]);
-    SE = __sample_2NNBAR(SE);
+    // copy seedSE
+    i = 0;
+    while (i < BYTES_SEED_SE/8) {
+        t = coins[u64 BYTES_SEC/8 + i];
+        indcoins[u64 BYTES_SEED_A/8 + i] = t;
+        i += 1;
+    }
 
-    () = #spill(coins);
+    () = #spill(pkp, skp, coins);
+    indcoins[0:BYTES_SEED_A] = __shake256_seed_A_opt(indcoins[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]);
 
-    // B = A*S+E
-    B = __AS_plus_E_opt(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]);
+    pk, sk = __indcpa_keypair_derand(indcoins);
+    pkh = __shake256_pkh_opt(pkh, pk);
 
-    // pack
-    pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B);
+    () = #unspill(pkp, skp);
 
-    () = #unspill(i);
     i = 0;
     while (i < BYTES_PK/8) {
-        sk[u64 BYTES_SEC/8 + i] = pk[u64 i];
+        t = pk[u64 i];
+        [skp + BYTES_SEC + i*8] = t;
         i += 1;
     }
 
     i = 0;
     while (i < 2 * NNBAR / 8) {
-        sk[u64 BYTES_SEC/8 + BYTES_PK/8 + i] = SE[u64 i];
+        t = sk[u64 i];
+        [skp + BYTES_SEC + BYTES_PK + i*8] = t;
         i += 1;
     }
-    () = #spill(i);
-
-    sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake256_pkh_opt(sk[BYTES_SEC + BYTES_PK + 2 * NNBAR:BYTES_SEC], pk);
 
-    () = #unspill(i, j, pkp, skp);
-    i = 0; j = 0;
+    i = 0;
     while (i < BYTES_PK/8) {
-        [pkp + j] = pk[u64 i];
+        t = pk[u64 i];
+        [pkp + i*8] = t;
         i += 1;
-        j += 8;
     }
 
-    i = 0; j = 0;
-    while (i < BYTES_SK/8) {
-        [skp + j] = sk[u64 i];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = pkh[u64 i];
+        [skp + BYTES_SK - BYTES_SEC + i*8] = t;
         i += 1;
-        j += 8;
     }
 }
 
-#[returnaddress="stack"]
+inline
 fn __frodo_amd64_ref_enc_derand(
   reg u64 ctp ssp pkp,
   #spill_to_mmx reg ptr u8[BYTES_SEC + BYTES_SALT] coins) {
-    reg u64 i j;
+    reg u64 i t;
     inline int k;
 
     // seedA || b
@@ -90,40 +89,34 @@ fn __frodo_amd64_ref_enc_derand(
 
     // pkh || u || salt
     stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt;
-    // 0x96 || seedSE || k
-    stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k;
-    seedSE_k[0] = 0x96;
-
-    // S' || E' || E''
-    stack u16[2 * NNBAR + NBAR * NBAR] SEE;
+    // seedSE || k
+    stack u8[BYTES_SEED_SE + BYTES_SEC] seedSE_k;
 
-    stack u16[NNBAR] B;
-    reg ptr u16[NNBAR] Bp;
-    stack u16[NBAR * NBAR] C;
-    reg ptr u16[NBAR * NBAR] V;
     stack u8[BYTES_SEC] ss;
 
-    pkp = pkp;
-    () = #spill(ctp, ssp, i, j);
-
     // gen u || salt
-    for k = 0 to (BYTES_SEC + BYTES_SALT)/8 {
-        pkh_u_salt[u64 BYTES_SEC/8 + k] = coins[u64 k];
+    i = 0;
+    while (i < (BYTES_SEC + BYTES_SALT)/8) {
+        t = coins[u64 i];
+        pkh_u_salt[u64 BYTES_SEC/8 + i] = t;
+        i += 1;
     }
 
-    for k = 0 to BYTES_SALT/8 {
-        ct_k[u64 (D * N + D * NBAR)/8 + k] = pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k];
+    i = 0;
+    while (i < BYTES_SALT/8) {
+        t = coins[u64 BYTES_SEC/8 + i];
+        ct_k[u64 (BYTES_CT - BYTES_SALT)/8 + i] = t;
+        i += 1;
     }
 
-    () = #unspill(i, j);
     // read pk
-    i = 0; j = 0;
+    i = 0;
     while (i < BYTES_PK/8) {
-        #declassify pk[u64 i] = [pkp + j];
+        #declassify pk[u64 i] = [pkp + i*8];
         i += 1;
-        j += 8;
     }
-    () = #spill(i, j);
+
+    () = #spill(ctp, ssp, coins);
 
     // pkh
     pkh_u_salt[0:BYTES_SEC] = __shake256_pkh_opt(pkh_u_salt[0:BYTES_SEC], pk);
@@ -131,175 +124,116 @@ fn __frodo_amd64_ref_enc_derand(
     // seedSE || k
     seedSE_k = __shake256_SE_k_opt(seedSE_k, pkh_u_salt);
 
+    () = #unspill(coins);
+    ct_k[0:BYTES_CT - BYTES_SALT] = __indcpa_enc_derand(ct_k[0:BYTES_CT - BYTES_SALT], coins[0:BYTES_SEC], pk, seedSE_k[0:BYTES_SEED_SE]);
+
     // copy k
-    for k = 0 to BYTES_SEC/8 {
-        ct_k[u64 BYTES_CT/8 + k] = seedSE_k.[u64 1 + BYTES_SEED_SE + 8*k];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = seedSE_k[u64 BYTES_SEED_SE/8 + i];
+        ct_k[u64 BYTES_CT/8 + i] = t;
+        i += 1;
     }
 
-    // gen input bit string for sampling S and E
-    SEE = __shake256_encap_r_opt(SEE, seedSE_k[0 : 1 + BYTES_SEED_SE]);
-
-    // S' || E'
-    SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]);
-    // E''
-    SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]);
-
-    // B' = S'A + E''
-    Bp = SEE[NNBAR:NNBAR];
-    Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]);
-
-    // c1 <- Pack(B')
-    ct_k[0:D * N] = __pack_B(ct_k[0:D * N], Bp);
-
-    // B <- Unpack(b)
-    B = __unpack_B(B, pk[BYTES_SEED_A:D * N]);
-
-    // V = S'B + E''
-    V = SEE[NNBAR*2:NBAR*NBAR];
-    V = __SB_plus_E_opt(V, SEE[0:NNBAR], B);
-
-    // C = V + Encode(u)
-    C = __encode(C, pkh_u_salt[BYTES_SEC:BYTES_SEC]);
-    C = __matrix_add(C, V);
-
-    // c2 <- Pack(C)
-    ct_k[D * N: D * NBAR] = __pack_C(ct_k[D * N: D * NBAR], C);
-
     // ss <- shake(c1 || c2 || salt || k)
     ss = __shake256_ss_opt(ss, ct_k);
 
-    () = #unspill(i, j, ctp, ssp);
-    i = 0; j = 0;
+    () = #unspill(ctp, ssp);
+    i = 0;
+    _ = #init_msf();
     while (i < BYTES_CT/8) {
-        [ctp + j] = ct_k[u64 i];
+        t = ct_k[u64 i];
+        [ctp + i*8] = t;
         i += 1;
-        j += 8;
     }
 
     for k = 0 to BYTES_SEC/8 {
-        [ssp + 8*k] = ss[u64 k];
+        t = ss[u64 k];
+        [ssp + 8*k] = t;
     }
 }
 
-#[returnaddress="stack"]
 fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) {
     #public stack u8[BYTES_PK] pk;
     stack u8[2 * NNBAR] ST;
     stack u8[BYTES_SEC] s;
     stack u8[BYTES_CT + BYTES_SEC] ct_k;
-    stack u16[NNBAR] B Bp;
-    reg ptr u16[NNBAR] Bpp;
-    stack u16[NBAR * NBAR] M C Cp;
-    reg ptr u16[NBAR * NBAR] V;
+    stack u8[BYTES_CT - BYTES_SALT] ct2;
     stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt;
-    stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k;
+    stack u8[BYTES_SEED_SE + BYTES_SEC] seedSE_k;
     stack u8[BYTES_SEC] ss;
 
-    // S' || E' || E''
-    stack u16[2 * NNBAR + NBAR * NBAR] SEE;
-
-    reg u8 s1 s2;
-    reg u64 i j t;
-    stack u64 s_ssp s_skp;
-    inline int k;
+    reg u8 s1;
+    reg u64 i t;
 
     ctp = ctp;
     skp = skp;
-    s_ssp = ssp;
+    ssp = ssp;
 
     // copy pkh
-    for k = 0 to BYTES_SEC/8 {
-        pkh_u_salt[u64 k] = [skp + BYTES_SK - BYTES_SEC + 8*k];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = [skp + BYTES_SK - BYTES_SEC + i*8];
+        pkh_u_salt[u64 i] = t;
+        i += 1;
     }
-    s_skp = skp;
 
     // read ct
-    i = 0; j = 0;
+    i = 0;
     while (i < BYTES_CT/8) {
-        t = [ctp + j];
+        t = [ctp + i*8];
         ct_k[u64 i] = t;
         i += 1;
-        j += 8;
     }
 
-    for k = 0 to BYTES_SEC/8 {
-        s[u64 k] = [skp + 8*k];
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = [skp + i*8];
+        s[u64 i] = t;
+        i += 1;
     }
 
-    i = 0; j = 0;
+    i = 0;
     while (i < BYTES_PK/8) {
-        #declassify pk[u64 i] = [skp + BYTES_SEC + j];
+        t = [skp + BYTES_SEC + i*8];
+        #declassify pk[u64 i] = t;
         i += 1;
-        j += 8;
     }
 
-    i = 0; j = 0;
+    i = 0;
     while (i < 2 * NNBAR/8) {
-        ST[u64 i] = [skp + BYTES_SEC + BYTES_PK + j];
+        t = [skp + BYTES_SEC + BYTES_PK + i*8];
+        ST[u64 i] = t;
         i += 1;
-        j += 8;
     }
 
-    () = #spill(i);
-
     // copy salt
-    for k = 0 to BYTES_SALT/8 {
-        pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k] = ct_k[u64 (BYTES_CT - BYTES_SALT)/8 + k];
-    }
-
-    // B' <- Unpack(c1)
-    Bp = __unpack_B(Bp, ct_k[0:D * N]);
-    // C <- Unpack(c2)
-    C = __unpack_C(C, ct_k[D * N:D * NBAR]);
-
-    // M = C - B'S
-    M = __mul_BS_opt(M, Bp, ST);
-    M = __matrix_sub(M, C);
-
-    pkh_u_salt[BYTES_SEC:BYTES_SEC] = __decode(pkh_u_salt[BYTES_SEC:BYTES_SEC], M);
-
-    seedSE_k[0] = 0x96;
-    seedSE_k = __shake256_SE_k_opt(seedSE_k, pkh_u_salt);
-
-    SEE = __shake256_encap_r_opt(SEE, seedSE_k[0: 1 + BYTES_SEED_SE]);
-
-    // S' || E'
-    SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]);
-    // E''
-    SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]);
-
-    // B'' = S'A + E'
-    Bpp = SEE[NNBAR:NNBAR];
-    Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]);
-
-    // B'' (mod q)
-    () = #unspill(i);
     i = 0;
-    while (i < NNBAR) {
-        Bpp[i] &= (1 << D) - 1;
+    while (i < BYTES_SALT/8) {
+        t = ct_k[u64 (BYTES_CT - BYTES_SALT)/8 + i];
+        pkh_u_salt[u64 (BYTES_SEC * 2)/8 + i] = t;
         i += 1;
     }
 
-    //
-    B = __unpack_B(B, pk[BYTES_SEED_A:BYTES_PK - BYTES_SEED_A]);
-
-    V = SEE[NNBAR*2:NBAR*NBAR];
-    V = __SB_plus_E_opt(V, SEE[0:NNBAR], B);
+    pkh_u_salt[BYTES_SEC:BYTES_SEC] = __indcpa_dec(pkh_u_salt[BYTES_SEC:BYTES_SEC], ct_k[0:BYTES_CT - BYTES_SALT], ST);
 
-    Cp = __encode(Cp, pkh_u_salt[BYTES_SEC:BYTES_SEC]);
-    Cp = __matrix_add(Cp, V);
+    () = #spill(ssp);
+    seedSE_k = __shake256_SE_k_opt(seedSE_k, pkh_u_salt);
+    ct2 = __indcpa_enc_derand(ct2, pkh_u_salt[BYTES_SEC:BYTES_SEC], pk, seedSE_k[0:BYTES_SEED_SE]);
 
-    s1 = __ct_verify_NNBAR(Bp, Bpp);
-    s2 = __ct_verify_NBAR2(C, Cp);
-    s1 |= s2;
+    s1 = __ct_verify(ct_k[0:BYTES_CT - BYTES_SALT], ct2);
 
-    ct_k[BYTES_CT:BYTES_SEC] = __ct_select(ct_k[BYTES_CT:BYTES_SEC], seedSE_k[1+BYTES_SEED_SE:BYTES_SEC], s, s1);
+    ct_k[BYTES_CT:BYTES_SEC] = __ct_select(ct_k[BYTES_CT:BYTES_SEC], seedSE_k[BYTES_SEED_SE:BYTES_SEC], s, s1);
 
     ss = __shake256_ss_opt(ss, ct_k);
 
-    ssp = s_ssp;
-    for k = 0 to BYTES_SEC/8 {
-        [ssp + 8*k] = ss[u64 k];
+    _ = #init_msf();
+    () = #unspill(ssp);
+    i = 0;
+    while (i < BYTES_SEC/8) {
+        t = ss[u64 i];
+        [ssp + i*8] = t;
+        i += 1;
     }
 }
 
@@ -316,7 +250,7 @@ fn _frodo_amd64_ref_keypair(reg u64 pkp skp) {
 
 fn _frodo_amd64_ref_keypair_derand(reg u64 pkp skp coinsp) {
     #public stack u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins;
-    reg u64 i; stack u64 s_i;
+    reg u64 i;
 
     pkp = pkp;
     skp = skp;
@@ -327,7 +261,6 @@ fn _frodo_amd64_ref_keypair_derand(reg u64 pkp skp coinsp) {
         i += 1;
     }
 
-    s_i = i;
     __frodo_amd64_ref_keypair_derand(pkp, skp, coins);
 }