From c1b08ac0400487aba1a571e2ce0285e111868631 Mon Sep 17 00:00:00 2001
From: dan <tthsqe12@gmail.com>
Date: Fri, 15 Jul 2022 19:18:07 +0200
Subject: [PATCH] smarter hnf in fmpzi_gcd_shortest

---
 fmpzi/gcd_shortest.c  | 214 ++++++++++++++++++++++++++++--------------
 fmpzi/profile/p-gcd.c |  81 ++++++++++++++++
 2 files changed, 225 insertions(+), 70 deletions(-)
 create mode 100644 fmpzi/profile/p-gcd.c

diff --git a/fmpzi/gcd_shortest.c b/fmpzi/gcd_shortest.c
index 91a5d1dbc..e544bcfd6 100644
--- a/fmpzi/gcd_shortest.c
+++ b/fmpzi/gcd_shortest.c
@@ -183,10 +183,13 @@ void _fmpz_mat22_shortest_l_infinity(
 
 void _fmpzi_gcd_fmpz_shortest(
     fmpz_t gx, fmpz_t gy,
-    const fmpz_t ax, const fmpz_t ay,
+    const fmpz_t ax_, const fmpz_t ay_,
     const fmpz_t b)
 {
     fmpz_t A, B, C, ga, ua, va, g, u, v, axog, ayog, m1, m2, m3, m4;
+    fmpz_t t, ax_copy, ay_copy;
+    const fmpz* ax = ax_;
+    const fmpz* ay = ay_;
 
     if (fmpz_is_zero(b))
     {
@@ -194,17 +197,35 @@ void _fmpzi_gcd_fmpz_shortest(
         fmpz_set(gy, ay);
         return;
     }
-    else if (fmpz_is_zero(ax))
+
+    fmpz_init(t);
+    fmpz_init(ax_copy);
+    fmpz_init(ay_copy);
+
+    /* reduce ax+i*ay by b */
+    if (fmpz_cmpabs(ax, b) > 0)
+    {
+        fmpz_tdiv_qr(t, ax_copy, ax, b);
+        ax = ax_copy;
+    }
+
+    if (fmpz_cmpabs(ay, b) > 0)
+    {
+        fmpz_tdiv_qr(t, ay_copy, ay, b);
+        ay = ay_copy;
+    }
+
+    if (fmpz_is_zero(ax))
     {
         fmpz_gcd(gx, ay, b);
         fmpz_zero(gy);
-        return;
+        goto cleanup_stage1;
     }
     else if (fmpz_is_zero(ay))
     {
         fmpz_gcd(gx, ax, b);
         fmpz_zero(gy);
-        return;
+        goto cleanup_stage1;
     }
 
     fmpz_init(A);
@@ -223,12 +244,6 @@ void _fmpzi_gcd_fmpz_shortest(
     fmpz_init(m3);
     fmpz_init(m4);
 
-/*
-     ax ay           g*1 g*B
-    -ay ax  row ops   0  g*A
-     b  0   ------>   0   0
-     0  b             0   0
-*/
     fmpz_xgcd(ga, ua, va, ax, ay);
     fmpz_xgcd(g, u, v, ga, b);  /* v not needed */
     fmpz_divexact(axog, ax, g);
@@ -261,15 +276,54 @@ void _fmpzi_gcd_fmpz_shortest(
     fmpz_clear(m2);
     fmpz_clear(m3);
     fmpz_clear(m4);
+
+cleanup_stage1:
+    fmpz_clear(t);
+    fmpz_clear(ax_copy);
+    fmpz_clear(ay_copy);
 }
 
+#define PTR_SWAP(T, A, B)   \
+    do {                    \
+        T* __t_m_p = A;     \
+        A = B;              \
+        B = __t_m_p;        \
+    } while (0)
+
+
+/*
+     ax ay           g*1 g*B            g = gcd(gx, gy)
+    -ay ax  row ops   0  g*A        g^2*A = norm(gx + i*gy)
+     bx by  ------>   0   0
+    -by bx            0   0
 
+    Except when the gcd is purely real or purely imaginary, it is any shortest
+    vector in the above lattice in the l_infinity norm. In the exceptional case,
+    it turns out that A = 1 and B = 0, and we rely on _shortest_l_infinity
+    returning (1,0) or (0,1) instead of (1,1) in this case.
+*/
 void _fmpzi_gcd_shortest(
     fmpz_t gx, fmpz_t gy,
     const fmpz_t ax, const fmpz_t ay,
-    const fmpz_t bx, const fmpz_t by)
+    const fmpz_t bx_, const fmpz_t by_)
 {
-    fmpz_t A, B, C, axog, ayog, bxog, byog, ga, ua, va, gb, ub, vb, g, u, v, t, m1, m2, m3, m4;
+    fmpz_t A, B, C, ag, t1, t2, bg, bu, bv, g, u, v, m1, m2, m3, m4;
+    fmpz_t bx_copy, by_copy;
+    const fmpz* bx = bx_;
+    const fmpz* by = by_;
+
+    /* ensure norm(a) <= norm(b) approximately */
+    {
+        slong ax_size = fmpz_size(ax);
+        slong ay_size = fmpz_size(ay);
+        slong bx_size = fmpz_size(bx);
+        slong by_size = fmpz_size(by);
+        if (FLINT_MAX(ax_size, ay_size) > FLINT_MAX(bx_size, by_size))
+        {
+            PTR_SWAP(const fmpz, ax, bx);
+            PTR_SWAP(const fmpz, ay, by);
+        }
+    }
 
     if (fmpz_is_zero(ax))
     {
@@ -281,95 +335,115 @@ void _fmpzi_gcd_shortest(
         _fmpzi_gcd_fmpz_shortest(gx, gy, bx, by, ax);
         return;
     }
-    else if (fmpz_is_zero(bx))
-    {
-        _fmpzi_gcd_fmpz_shortest(gx, gy, ax, ay, by);
-        return;
-    }
-    else if (fmpz_is_zero(by))
-    {
-        _fmpzi_gcd_fmpz_shortest(gx, gy, ax, ay, bx);
-        return;
-    }
 
     fmpz_init(A);
     fmpz_init(B);
     fmpz_init(C);
-    fmpz_init(axog);
-    fmpz_init(ayog);
-    fmpz_init(bxog);
-    fmpz_init(byog);
-    fmpz_init(ga);
-    fmpz_init(ua);
-    fmpz_init(va);
-    fmpz_init(gb);
-    fmpz_init(ub);
-    fmpz_init(vb);
+    fmpz_init(ag);
+    fmpz_init(t1);
+    fmpz_init(t2);
+    fmpz_init(bg);
+    fmpz_init(bu);
+    fmpz_init(bv);
     fmpz_init(g);
     fmpz_init(u);
     fmpz_init(v);
-    fmpz_init(t);
     fmpz_init(m1);
     fmpz_init(m2);
     fmpz_init(m3);
     fmpz_init(m4);
+    fmpz_init(bx_copy);
+    fmpz_init(by_copy);
 
-/*
-     ax ay           g*1 g*B
-    -ay ax  row ops   0  g*A        g^2*A = norm(gx + i*gy)
-     bx by  ------>   0   0
-    -by bx            0   0
+    /*
+        first find t1*ax + t2*ay = ag
+         [ ax, ay]      [ag, t1*ay - t2*ax ]
+         [-ay, ax] ---> [ 0, (ax^2+ay^2)/ag]
+    */
+    fmpz_xgcd(ag, t1, t2, ax, ay);
+    fmpz_fmms(m1, t1, ay, t2, ax);
+    fmpz_fmma(m2, ax, ax, ay, ay);
+    fmpz_divexact(m2, m2, ag);
 
-    Except when the gcd is purely real or purely imaginary, it is any shortest
-    vector in the above lattice in the l_infinity norm. In the exceptional case,
-    it turns out that A = 1 and B = 0, and we rely on _shortest_l_infinity
-    returning (1,0) or (0,1) instead of (1,1) in this case.
-*/
-    fmpz_xgcd(ga, ua, va, ax, ay);
-    fmpz_xgcd(gb, ub, vb, bx, by);
-    fmpz_xgcd(g, u, v, ga, gb);
-    fmpz_divexact(axog, ax, g);
-    fmpz_divexact(ayog, ay, g);
-    fmpz_divexact(bxog, bx, g);
-    fmpz_divexact(byog, by, g);
-    fmpz_fmms(m1, ayog, ua, axog, va);
-    fmpz_fmma(m2, ax, axog, ay, ayog);
-    fmpz_divexact(m2, m2, ga);
-    fmpz_fmms(m3, byog, ub, bxog, vb);
-    fmpz_fmma(m4, bx, bxog, by, byog);
-    fmpz_divexact(m4, m4, gb);
-    fmpz_fmms(t, m3, ga, m1, gb);
+    /* next reduce bx+i*by by (ax^2+ay^2)/ag */
+    if (fmpz_cmpabs(bx, m2) > 0)
+    {
+        fmpz_tdiv_qr(t1, bx_copy, bx, m2);
+        bx = bx_copy;
+    }
+
+    if (fmpz_cmpabs(by, m2) > 0)
+    {
+        fmpz_tdiv_qr(t1, by_copy, by, m2);
+        by = by_copy;
+    }
+
+    /* now reduce [bx by; -by bx] */
+    if (fmpz_is_zero(bx) && fmpz_is_zero(by))
+    {
+        /* avoid division by bg below */
+        fmpz_set(gx, ax);
+        fmpz_set(gy, ay);
+        goto cleanup;
+    }
+
+    fmpz_xgcd(bg, bu, bv, bx, by);
+    fmpz_xgcd(g, u, v, ag, bg);
+
+    if (!fmpz_is_one(g))
+    {
+        fmpz_divexact(m1, m1, g);
+        fmpz_divexact(m2, m2, g);
+        fmpz_divexact(t1, bx, g);
+        fmpz_divexact(t2, by, g);
+        fmpz_fmms(m3, bu, t2, bv, t1);
+        fmpz_fmma(m4, bx, t1, by, t2);
+    }
+    else
+    {
+        fmpz_fmms(m3, bu, by, bv, bx);
+        fmpz_fmma(m4, bx, bx, by, by);
+    }
+
+    fmpz_divexact(m4, m4, bg);
+
+    /* now reduce the one remaining 2x2 */
+    fmpz_fmms(t1, m3, ag, m1, bg);
     fmpz_fmma(m1, m1, u, m3, v);
-    fmpz_divexact(m3, t, g);
+    if (fmpz_is_one(g))
+        fmpz_swap(m3, t1);
+    else
+        fmpz_divexact(m3, t1, g);
+
+    /* reduce last column */
     fmpz_gcd3(A, m2, m3, m4);
-    fmpz_fdiv_qr(t, B, m1, A);
+    fmpz_fdiv_qr(t1, B, m1, A);
     fmpz_one(C);
 
     _fmpz_mat22_shortest_l_infinity(gx, gy, u, v, C, B, A);
     fmpz_mul(gx, gx, g);
     fmpz_mul(gy, gy, g);
 
+cleanup:
+
     fmpz_clear(A);
     fmpz_clear(B);
     fmpz_clear(C);
-    fmpz_clear(axog);
-    fmpz_clear(ayog);
-    fmpz_clear(bxog);
-    fmpz_clear(byog);
-    fmpz_clear(ga);
-    fmpz_clear(ua);
-    fmpz_clear(va);
-    fmpz_clear(gb);
-    fmpz_clear(ub);
-    fmpz_clear(vb);
+    fmpz_clear(ag);
+    fmpz_clear(t1);
+    fmpz_clear(t2);
+    fmpz_clear(bg);
+    fmpz_clear(bu);
+    fmpz_clear(bv);
     fmpz_clear(g);
     fmpz_clear(u);
     fmpz_clear(v);
-    fmpz_clear(t);
     fmpz_clear(m1);
     fmpz_clear(m2);
     fmpz_clear(m3);
     fmpz_clear(m4);
+    fmpz_clear(bx_copy);
+    fmpz_clear(by_copy);
 }
 
 
diff --git a/fmpzi/profile/p-gcd.c b/fmpzi/profile/p-gcd.c
new file mode 100644
index 000000000..a66df629e
--- /dev/null
+++ b/fmpzi/profile/p-gcd.c
@@ -0,0 +1,81 @@
+/*
+    Copyright 2022 Daniel Schultz
+
+    This file is part of Arb.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "fmpzi.h"
+#include "flint/profiler.h"
+
+/* approx nbits(norm(x)) = n */
+static void fmpzi_randbits_norm(fmpzi_t x, flint_rand_t state, flint_bitcnt_t n)
+{
+    fmpz_randbits(fmpzi_realref(x), state, (n+1)/2);
+    fmpz_randtest(fmpzi_imagref(x), state, (n+0)/2);
+    if (n_randint(state, 2))
+        fmpz_swap(fmpzi_realref(x), fmpzi_imagref(x));
+}
+
+double profile_it(ulong a_bits, ulong b_bits, ulong g_bits, flint_rand_t state)
+{
+    fmpzi_t g, a, b;
+    slong i, nreps;
+    timeit_t timer;
+
+    fmpzi_init(g);
+    fmpzi_init(a);
+    fmpzi_init(b);
+
+    nreps = 1 + 5000000/(1 + a_bits + b_bits);
+
+    timeit_start(timer);
+    for (i = 0; i < nreps; i++)
+    {
+        fmpzi_randbits_norm(a, state, FLINT_MAX(1, a_bits - g_bits));
+        fmpzi_randbits_norm(b, state, FLINT_MAX(1, b_bits - g_bits));
+        fmpzi_randbits_norm(g, state, g_bits);
+        fmpzi_mul(a, a, g);
+        fmpzi_mul(b, b, g);
+        fmpzi_gcd_shortest(g, a, b);
+        fmpzi_gcd_shortest(g, b, a);
+    }
+    timeit_stop(timer);
+
+    fmpzi_clear(g);
+    fmpzi_clear(a);
+    fmpzi_clear(b);
+
+    return timer->wall/nreps;
+}
+
+
+int main(void)
+{
+    ulong i;
+    ulong a_bits, b_bits, g_bits;
+    flint_rand_t state;
+
+    flint_randinit(state);
+
+    for (a_bits = 100000; a_bits < 10000000; a_bits += 1 + a_bits/2)
+    for (b_bits = a_bits; b_bits < 10000000; b_bits += 1 + b_bits/2)
+    {
+        flint_printf("a_bits:%8wu, b_bits:%8wu |", a_bits, b_bits);
+        for (g_bits = 1; g_bits < a_bits/2; g_bits += 100 + g_bits/2)
+        {
+            double t = profile_it(a_bits, b_bits, g_bits, state);
+            flint_printf(" %5.0f", t);
+            fflush(stdout);
+        }
+        flint_printf("\n");
+    }
+
+    flint_randclear(state);
+
+    return 0;
+}