/***************************************************************************
 *   Copyright (C) 2004-2025 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 2.0.2.90. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#ifndef MULTIPATTERNLENGTHS_H
#define MULTIPATTERNLENGTHS_H

#include <QList>
#include <QVector>

#include <algorithm>
#include <cmath>


/**
   Length record for one subpattern of a pattern of the form
   AND, OR, LENGTH  (example: 'alpha_aaa|tau')

*/
struct subpatternLength
{
    subpatternLength(const int va, const int vo, const int vl)
        : a(va)
        , o(vo)
        , l(vl)
    {
    }

    int a; // and
    int o; // or
    int l; // length
};

/**
   Auxiliary Multipattern Lengths struct for type patterns Context and FixedStringContext

   Keeps one accumulated length per AND/OR subpattern entry and, on cleanup,
   writes the sorted set of distinct pattern lengths into the vector given
   at construction.

*/
struct multipatternLengths
{
    /**
       Binds to the lengths vector pointed to by plengths and resets it to
       hold the single length 0. Only a reference is stored: plengths must be
       non-null and must outlive this object.
    */
    explicit multipatternLengths(QVector<int>* plengths) : patternlengths(*plengths), blankspaces(0)
    {
        patternlengths.resize(0);
        patternlengths.append(0);
    }


    /**
       Restarts the bookkeeping with one zero-length entry per AND index
       0..nands-1, all on OR index 0, and sets blankspaces to nands - 1.
    */
    void init(const int nands)
    {
        subpatternlengths.clear();
        for (int a = 0; a < nands; ++a)
            subpatternlengths.append(subpatternLength(a, 0, 0));
        blankspaces = nands - 1;
    }

    /**
       Replicates the current entries for nors OR alternatives: the existing
       entries are set to OR index 0, and a copy of each one (same AND index,
       same length) is appended for every OR index 1..nors-1.
    */
    void branch(const int nors)
    {
        // Entry count is taken before appending, so only the entries
        // present on entry are replicated
        const int nspl(subpatternlengths.count());
        for (int i = 0; i < nspl; ++i)
            subpatternlengths[i].o = 0;
        for (int o = 1; o < nors; ++o)
            for (int i = 0; i < nspl; ++i)
                subpatternlengths.append(subpatternLength(subpatternlengths.at(i).a, o, subpatternlengths.at(i).l));
    }

    /**
       Adds a subpattern to every entry on OR index o. One extra unit is
       added as separator whenever the entry already has a nonzero length.

       Without subbranch, entries whose AND index equals a grow by stl and
       the remaining ones grow by spl. With subbranch, each entry is split:
       a new entry grown by spl is appended, and the existing entry grows by
       stl.
    */
    void add(const int a, const int o, const int spl, const int stl, const bool subbranch = false)
    {
        // Entry count is taken before the loop, so entries appended by a
        // subbranch are not visited again
        const int nspl(subpatternlengths.count());

        for (int i = 0; i < nspl; ++i)
            if (subpatternlengths.at(i).o == o)
            {
                const int ss(subpatternlengths.at(i).l == 0 ? 0 : 1);
                if (subbranch)
                {
                    subpatternlengths.append(
                        subpatternLength(subpatternlengths.at(i).a, o, subpatternlengths.at(i).l + spl + ss));
                    subpatternlengths[i].l += stl + ss;
                }
                else
                {
                    const int l(subpatternlengths.at(i).a == a ? stl : spl);
                    subpatternlengths[i].l += l + ss;
                }
            }
    }

    /**
       Rebuilds the bound lengths vector from the accumulated entries: the
       distinct lengths in increasing order, extended by scorerLengths using
       the current number of blank spaces.
    */
    void cleanup()
    {
        patternlengths.resize(0);
        const int nspl(subpatternlengths.count());
        for (int i = 0; i < nspl; ++i)
            patternlengths.append(subpatternlengths.at(i).l);

        // Sort first, then drop adjacent duplicates, instead of a linear
        // membership scan per inserted element
        std::sort(patternlengths.begin(), patternlengths.end());
        patternlengths.erase(std::unique(patternlengths.begin(), patternlengths.end()), patternlengths.end());

        patternlengths = scorerLengths(patternlengths, blankspaces, false);
    }

    /**
       Multiword context type patterns 'w1 w2 ...' match also united words
       'w1w2 ...' On occasion, words in text appear either separated or united
       with often the same meaning. For example, the pair 'non parametric' and
       'nonparametric'.

       In general when using matched lengths comparisons for scoring, results
       are better when constraining blanks.

           CONTEXT: ' '  300|3300|0.900   documents:      100000
                                             sum     r-prec    map
           no spaces, no score opt          76449    0.341    0.385
           no spaces, score opt             76517    0.342    0.385
           spaces, no score opt             75727    0.336    0.379
           spaces, score opt                75850    0.338    0.380


       To accommodate cases such as the one mentioned above, blank
       flexibilization is kept to nblanks - 1. For example,

         'non parametric' length: 14
         'non parametric regression' lengths: 24 and 25

       Note that 'nonparametric' will not match texts having 'non parametric'.
       To force considering these cases a dot must be specified in the pattern.
       For example,

         'non.parametric' lengths: 13 and 14
         'non.parametric regression' lengths: 24 and 25

       Returns plengths unchanged when the effective number of blanks
       (nblanks, plus one if hasdot) is below 1. Otherwise returns, in
       increasing order, the input lengths together with every input length
       shortened by 1 up to that number of blanks, each shortened value being
       added only if not already present.

    */
    static QVector<int> scorerLengths(const QVector<int>& plengths, const int nblanks, const bool hasdot)
    {
        const int nb(hasdot ? nblanks + 1 : nblanks);
        if (nb < 1)
            return plengths;

        const int npl(plengths.count());
        QVector<int> sls(plengths);

        for (int b = 1; b <= nb; ++b)
            for (int i = 0; i < npl; ++i)
            {
                const int l(plengths.at(i) - b);
                if (!sls.contains(l))
                    sls.append(l);
            }

        std::sort(sls.begin(), sls.end());
        return sls;
    }

    /**
       Convenience overload for a single pattern length.
    */
    static QVector<int> scorerLengths(const int plength, const int nblanks, const bool hasdot)
    {
        QVector<int> plengths(1, plength);
        return scorerLengths(plengths, nblanks, hasdot);
    }

    QList<subpatternLength> subpatternlengths; // one entry per AND/OR subpattern combination
    QVector<int>& patternlengths;              // lengths vector bound at construction, not owned
    int blankspaces;                           // set by init to nands - 1
};

#endif
