/*
 * Decompiled with CFR 0.152.
 */
package org.netlib.blas;

import java.util.Arrays;
import org.netlib.blas.Dgemm4x4Parallel;
import org.netlib.blas.DgemmTasks;

/**
 * Blocked, packed double-precision general matrix multiply built around a
 * 4x4 micro-kernel.
 *
 * <p>{@link #dgemm} computes {@code C := beta * C + alpha * A * B}. Every matrix is
 * addressed through an offset plus explicit row and column increments, i.e. element
 * {@code (i, j)} of {@code A} is {@code A[offA + i * incRowA + j * incColA]}.
 *
 * <p>The operands are cut into blocks of at most {@link #MC} x {@link #KC} (from A) and
 * {@link #KC} x {@link #NC} (from B). Each block is copied into a contiguous work buffer
 * as a sequence of 4-wide panels (zero padded at the edges) before the micro-kernel
 * multiplies one A panel with one B panel.
 */
final class Dgemm4x4 {
    /** Number of rows of the tile produced by one micro-kernel call. */
    static final int MR_Height = 4;
    /** Number of columns of the tile produced by one micro-kernel call. */
    static final int NR_Width = 4;
    /** Maximum number of rows of A (and C) handled per packed block. */
    static final int MC = 384;
    /** Maximum length of the shared (inner) dimension handled per packed block. */
    static final int KC = 384;
    /** Maximum number of columns of B (and C) handled per packed block. */
    static final int NC = 4096;

    Dgemm4x4() {
    }

    /**
     * Rounds {@code value} up to the next multiple of {@code multiple}.
     * Expects {@code value >= 0} and {@code multiple > 0}.
     */
    private static int roundUpToMultiple(int value, int multiple) {
        return (value + multiple - 1) / multiple * multiple;
    }

    /**
     * Packs one full panel of {@link #MR_Height} rows and {@code k} columns of A into
     * {@code work}, starting at {@code work_start}. The panel is stored column by column,
     * so the {@link #MR_Height} values of one column are adjacent.
     */
    private static void pack_A_MRxk(int k, int A_start, double[] A, int incRowA, int incColA, double[] work, int work_start) {
        for (int j = 0; j < k; ++j) {
            for (int i = 0; i < MR_Height; ++i) {
                work[work_start + i] = A[A_start + i * incRowA];
            }
            work_start += MR_Height;
            A_start += incColA;
        }
    }

    /**
     * Packs one full panel of {@code k} rows and {@link #NR_Width} columns of B into
     * {@code work}, starting at {@code work_start}. The panel is stored row by row,
     * so the {@link #NR_Width} values of one row are adjacent.
     */
    private static void pack_B_kxNR(int k, int B_start, double[] B, int incRowB, int incColB, double[] work, int work_start) {
        for (int i = 0; i < k; ++i) {
            for (int j = 0; j < NR_Width; ++j) {
                work[work_start + j] = B[B_start + j * incColB];
            }
            work_start += NR_Width;
            B_start += incRowB;
        }
    }

    /**
     * Packs an {@code mc} x {@code kc} block of A into {@code work} (from index 0) as
     * consecutive panels of {@link #MR_Height} rows. When {@code mc} is not a multiple of
     * {@link #MR_Height}, the final panel is padded with zero rows.
     *
     * @param mc      number of rows in the block
     * @param kc      number of columns in the block
     * @param A_start index of the block's first element inside {@code A}
     * @param A       source matrix storage
     * @param incRowA index distance between vertically adjacent elements of A
     * @param incColA index distance between horizontally adjacent elements of A
     * @param work    destination; written in {@code [0, ceil(mc / 4) * 4 * kc)}
     */
    static void pack_A(int mc, int kc, int A_start, double[] A, int incRowA, int incColA, double[] work) {
        int fullPanels = mc / MR_Height;
        int tailRows = mc % MR_Height;
        int work_start = 0;
        for (int p = 0; p < fullPanels; ++p) {
            pack_A_MRxk(kc, A_start, A, incRowA, incColA, work, work_start);
            work_start += kc * MR_Height;
            A_start += MR_Height * incRowA;
        }
        if (tailRows > 0) {
            for (int j = 0; j < kc; ++j) {
                for (int i = 0; i < tailRows; ++i) {
                    work[work_start + i] = A[A_start + i * incRowA];
                }
                // Zero padding lets the micro-kernel always process a full 4-row panel.
                for (int i = tailRows; i < MR_Height; ++i) {
                    work[work_start + i] = 0.0;
                }
                work_start += MR_Height;
                A_start += incColA;
            }
        }
    }

    /**
     * Packs a {@code kc} x {@code nc} block of B into {@code work} (from index 0) as
     * consecutive panels of {@link #NR_Width} columns. When {@code nc} is not a multiple of
     * {@link #NR_Width}, the final panel is padded with zero columns.
     *
     * @param kc      number of rows in the block
     * @param nc      number of columns in the block
     * @param B_start index of the block's first element inside {@code B}
     * @param B       source matrix storage
     * @param incRowB index distance between vertically adjacent elements of B
     * @param incColB index distance between horizontally adjacent elements of B
     * @param work    destination; written in {@code [0, kc * ceil(nc / 4) * 4)}
     */
    static void pack_B(int kc, int nc, int B_start, double[] B, int incRowB, int incColB, double[] work) {
        int fullPanels = nc / NR_Width;
        int tailCols = nc % NR_Width;
        int work_start = 0;
        for (int p = 0; p < fullPanels; ++p) {
            pack_B_kxNR(kc, B_start, B, incRowB, incColB, work, work_start);
            work_start += kc * NR_Width;
            B_start += NR_Width * incColB;
        }
        if (tailCols > 0) {
            for (int i = 0; i < kc; ++i) {
                for (int j = 0; j < tailCols; ++j) {
                    work[work_start + j] = B[B_start + j * incColB];
                }
                // Zero padding lets the micro-kernel always process a full 4-column panel.
                for (int j = tailCols; j < NR_Width; ++j) {
                    work[work_start + j] = 0.0;
                }
                work_start += NR_Width;
                B_start += incRowB;
            }
        }
    }

    /**
     * Multiplies one packed A panel (4 x {@code kc}) with one packed B panel
     * ({@code kc} x 4) and updates a 4x4 tile of C:
     * {@code C := beta * C + alpha * (A_panel * B_panel)}.
     *
     * <p>{@code AB} is scratch space. It is cleared completely and then holds the raw
     * product with element {@code (i, j)} at {@code AB[i + 4 * j]}; it needs at least
     * 16 elements.
     */
    private static void dgemm_micro_kernel(int kc, double alpha, int A_panel_start, double[] A_panel, int B_panel_start, double[] B_panel, double beta, int C_panel_start, double[] C_panel, int incRowC, int incColC, double[] AB) {
        Arrays.fill(AB, 0.0);
        for (int l = 0; l < kc; ++l) {
            // One row of the B panel and one column of the A panel per step.
            double b0 = B_panel[B_panel_start];
            double b1 = B_panel[B_panel_start + 1];
            double b2 = B_panel[B_panel_start + 2];
            double b3 = B_panel[B_panel_start + 3];
            B_panel_start += NR_Width;
            double a0 = A_panel[A_panel_start];
            double a1 = A_panel[A_panel_start + 1];
            double a2 = A_panel[A_panel_start + 2];
            double a3 = A_panel[A_panel_start + 3];
            A_panel_start += MR_Height;

            // Rank-1 update of the 4x4 accumulator, one output column at a time.
            AB[0] += a0 * b0;
            AB[1] += a1 * b0;
            AB[2] += a2 * b0;
            AB[3] += a3 * b0;

            AB[4] += a0 * b1;
            AB[5] += a1 * b1;
            AB[6] += a2 * b1;
            AB[7] += a3 * b1;

            AB[8] += a0 * b2;
            AB[9] += a1 * b2;
            AB[10] += a2 * b2;
            AB[11] += a3 * b2;

            AB[12] += a0 * b3;
            AB[13] += a1 * b3;
            AB[14] += a2 * b3;
            AB[15] += a3 * b3;
        }
        if (beta != 1.0) {
            dgemm_micro_betaMulC(beta, C_panel_start, C_panel, incRowC, incColC);
        }
        dgemm_micro_plusAlphaAB(alpha, C_panel_start, C_panel, incRowC, incColC, AB);
    }

    /**
     * Scales a 4x4 tile of C by {@code beta}. For {@code beta == 0} the tile is
     * overwritten with zeros instead of being multiplied, so existing NaN or infinite
     * entries do not survive.
     */
    private static void dgemm_micro_betaMulC(double beta, int C_panel_start, double[] C_panel, int incRowC, int incColC) {
        if (beta == 0.0) {
            for (int j = 0; j < NR_Width; ++j) {
                int base_C = C_panel_start + j * incColC;
                for (int i = 0; i < MR_Height; ++i) {
                    C_panel[base_C + i * incRowC] = 0.0;
                }
            }
        } else {
            for (int j = 0; j < NR_Width; ++j) {
                int base_C = C_panel_start + j * incColC;
                for (int i = 0; i < MR_Height; ++i) {
                    C_panel[base_C + i * incRowC] *= beta;
                }
            }
        }
    }

    /**
     * Adds {@code alpha * AB} to a 4x4 tile of C, where {@code AB} holds element
     * {@code (i, j)} at {@code AB[i + 4 * j]}. The multiplication is skipped when
     * {@code alpha == 1}.
     */
    private static void dgemm_micro_plusAlphaAB(double alpha, int C_panel_start, double[] C_panel, int incRowC, int incColC, double[] AB) {
        if (alpha == 1.0) {
            for (int j = 0; j < NR_Width; ++j) {
                int abColumn = j * MR_Height;
                int base_C = C_panel_start + j * incColC;
                for (int i = 0; i < MR_Height; ++i) {
                    C_panel[base_C + i * incRowC] += AB[i + abColumn];
                }
            }
        } else {
            for (int j = 0; j < NR_Width; ++j) {
                int abColumn = j * MR_Height;
                int base_C = C_panel_start + j * incColC;
                for (int i = 0; i < MR_Height; ++i) {
                    C_panel[base_C + i * incRowC] += alpha * AB[i + abColumn];
                }
            }
        }
    }

    /**
     * Multiplies a packed {@code mc} x {@code kc} block of A with a packed
     * {@code kc} x {@code nc} block of B and updates the matching {@code mc} x {@code nc}
     * region of C, one 4x4 tile at a time.
     *
     * <p>Full tiles are written straight into C. Edge tiles (fewer than 4 rows or columns)
     * are first computed into {@code workC} and then only the valid part is merged into C.
     *
     * @param _A    block of A as produced by {@link #pack_A}
     * @param _B    block of B as produced by {@link #pack_B}
     * @param AB    scratch accumulator, at least 16 elements
     * @param workC scratch tile for edge handling, at least 16 elements
     * @return the number of micro-kernel invocations performed
     */
    static int dgemm_macro_kernel(int mc, int nc, int kc, double alpha, double beta, int C_start, double[] C, int incRowC, int incColC, double[] _A, double[] _B, double[] AB, double[] workC) {
        int micro_kernel_calls = 0;
        int rowTiles = (mc + MR_Height - 1) / MR_Height;
        int colTiles = (nc + NR_Width - 1) / NR_Width;
        int tailRows = mc % MR_Height;
        int tailCols = nc % NR_Width;
        for (int j = 0; j < colTiles; ++j) {
            int nr = (j != colTiles - 1 || tailCols == 0) ? NR_Width : tailCols;
            int bPanelStart = j * kc * NR_Width;
            for (int i = 0; i < rowTiles; ++i) {
                int mr = (i != rowTiles - 1 || tailRows == 0) ? MR_Height : tailRows;
                int aPanelStart = i * kc * MR_Height;
                int cTileStart = C_start + i * MR_Height * incRowC + j * NR_Width * incColC;
                if (mr == MR_Height && nr == NR_Width) {
                    dgemm_micro_kernel(kc, alpha, aPanelStart, _A, bPanelStart, _B, beta, cTileStart, C, incRowC, incColC, AB);
                } else {
                    // The micro-kernel always writes a full 4x4 tile, so an edge tile is
                    // computed into the scratch tile (beta = 0 discards its old content)
                    // and only the mr x nr valid part is applied to C.
                    dgemm_micro_kernel(kc, alpha, aPanelStart, _A, bPanelStart, _B, 0.0, 0, workC, 1, MR_Height, AB);
                    dgescal(mr, nr, beta, cTileStart, C, incRowC, incColC);
                    dgeaxpy(mr, nr, 1.0, workC, 1, MR_Height, cTileStart, C, incRowC, incColC);
                }
                ++micro_kernel_calls;
            }
        }
        return micro_kernel_calls;
    }

    /**
     * Scales the {@code m} x {@code n} matrix X in place by {@code alpha}. For
     * {@code alpha == 0} the elements are set to zero rather than multiplied.
     */
    private static void dgescal(int m, int n, double alpha, int X_start, double[] X, int incRowX, int incColX) {
        if (alpha != 0.0) {
            for (int j = 0; j < n; ++j) {
                int base_X = X_start + j * incColX;
                for (int i = 0; i < m; ++i) {
                    X[base_X + i * incRowX] *= alpha;
                }
            }
        } else {
            for (int j = 0; j < n; ++j) {
                int base_X = X_start + j * incColX;
                for (int i = 0; i < m; ++i) {
                    X[base_X + i * incRowX] = 0.0;
                }
            }
        }
    }

    /**
     * Computes {@code Y := Y + alpha * X} for {@code m} x {@code n} matrices. X is read
     * starting at index 0 of its array; Y starts at {@code Y_start}. The multiplication is
     * skipped when {@code alpha == 1}.
     */
    private static void dgeaxpy(int m, int n, double alpha, double[] X, int incRowX, int incColX, int Y_start, double[] Y, int incRowY, int incColY) {
        if (alpha != 1.0) {
            for (int j = 0; j < n; ++j) {
                int base_Y = Y_start + j * incColY;
                int base_X = j * incColX;
                for (int i = 0; i < m; ++i) {
                    Y[base_Y + i * incRowY] += alpha * X[i * incRowX + base_X];
                }
            }
        } else {
            for (int j = 0; j < n; ++j) {
                int base_Y = Y_start + j * incColY;
                int base_X = j * incColX;
                for (int i = 0; i < m; ++i) {
                    Y[base_Y + i * incRowY] += X[i * incRowX + base_X];
                }
            }
        }
    }

    /**
     * Computes {@code C := beta * C + alpha * A * B}, where A is {@code rowsA} x
     * {@code colsA}, B is {@code colsA} x {@code colsB} and C is {@code rowsA} x
     * {@code colsB}.
     *
     * <p>If {@code alpha == 0} or {@code colsA == 0}, C is only scaled by {@code beta}.
     * If the problem spans more than one row block or column block and
     * {@code DgemmTasks.availableCores()} reports more than one core, the work is handed
     * to {@code Dgemm4x4Parallel.dgemm}; otherwise it is processed on the calling thread.
     *
     * @param rowsA   number of rows of A and C
     * @param colsB   number of columns of B and C
     * @param colsA   number of columns of A and rows of B
     * @param alpha   scale factor applied to the product {@code A * B}
     * @param offA    index of A's first element inside {@code A}
     * @param A       storage of A
     * @param incRowA index distance between vertically adjacent elements of A
     * @param incColA index distance between horizontally adjacent elements of A
     * @param offB    index of B's first element inside {@code B}
     * @param B       storage of B
     * @param incRowB index distance between vertically adjacent elements of B
     * @param incColB index distance between horizontally adjacent elements of B
     * @param beta    scale factor applied to the previous content of C
     * @param offC    index of C's first element inside {@code C}
     * @param C       storage of C; updated in place
     * @param incRowC index distance between vertically adjacent elements of C
     * @param incColC index distance between horizontally adjacent elements of C
     * @return the number of micro-kernel invocations on the sequential path, or the value
     *         returned by {@code Dgemm4x4Parallel.dgemm} on the parallel path
     */
    static int dgemm(int rowsA, int colsB, int colsA, double alpha, int offA, double[] A, int incRowA, int incColA, int offB, double[] B, int incRowB, int incColB, double beta, int offC, double[] C, int incRowC, int incColC) {
        int micro_kernel_calls = 0;
        if (alpha == 0.0 || colsA == 0) {
            dgescal(rowsA, colsB, beta, offC, C, incRowC, incColC);
            return micro_kernel_calls;
        }
        // Number of blocks along each dimension and the size of the trailing partial
        // block (0 means the last block is full).
        int mb = (rowsA + MC - 1) / MC;
        int nb = (colsB + NC - 1) / NC;
        int kb = (colsA + KC - 1) / KC;
        int _mc = rowsA % MC;
        int _nc = colsB % NC;
        int _kc = colsA % KC;
        if ((mb > 1 || nb > 1) && DgemmTasks.availableCores() > 1) {
            return Dgemm4x4Parallel.dgemm(nb, kb, mb, _nc, _kc, _mc, alpha, offA, A, incRowA, incColA, offB, B, incRowB, incColB, beta, offC, C, incRowC, incColC);
        }
        // Size the packing buffers for the largest block this call can produce instead of
        // always allocating (and zero-filling) the MC*KC and KC*NC maxima. Packed panels
        // are padded to a multiple of the tile size, hence the rounding.
        int maxMc = Math.max(0, Math.min(rowsA, MC));
        int maxKc = Math.max(0, Math.min(colsA, KC));
        int maxNc = Math.max(0, Math.min(colsB, NC));
        double[] _A = new double[roundUpToMultiple(maxMc, MR_Height) * maxKc];
        double[] _B = new double[maxKc * roundUpToMultiple(maxNc, NR_Width)];
        double[] _C = new double[MR_Height * NR_Width];
        double[] AB = new double[MR_Height * NR_Width];
        for (int j = 0; j < nb; ++j) {
            int nc = (j != nb - 1 || _nc == 0) ? NC : _nc;
            for (int l = 0; l < kb; ++l) {
                int kc = (l != kb - 1 || _kc == 0) ? KC : _kc;
                // beta must scale C exactly once; later k-blocks accumulate on top.
                double _beta = (l == 0) ? beta : 1.0;
                pack_B(kc, nc, offB + l * KC * incRowB + j * NC * incColB, B, incRowB, incColB, _B);
                for (int i = 0; i < mb; ++i) {
                    int mc = (i != mb - 1 || _mc == 0) ? MC : _mc;
                    pack_A(mc, kc, offA + i * MC * incRowA + l * KC * incColA, A, incRowA, incColA, _A);
                    micro_kernel_calls += dgemm_macro_kernel(mc, nc, kc, alpha, _beta, offC + i * MC * incRowC + j * NC * incColC, C, incRowC, incColC, _A, _B, AB, _C);
                }
            }
        }
        return micro_kernel_calls;
    }
}

