import com.aliasi.io.LogLevel;
import com.aliasi.io.Reporter;
import com.aliasi.io.Reporters;

import com.aliasi.matrix.SvdMatrix;

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.symbol.MapSymbolTable;

import com.aliasi.util.Files;
import com.aliasi.util.ObjectToDoubleMap;
import com.aliasi.util.Strings;

import java.io.File;
import java.io.PrintWriter;
import java.io.OutputStreamWriter;

import java.util.Arrays;
import java.util.List;


/**
 * Command-line demo that extracts a token-bigram count matrix from a
 * text file, converts it to a sparse (partial) representation that keeps
 * only the non-zero cells, and factors it with
 * {@code SvdMatrix.partialSvd}.
 *
 * <p>The sparse representation is a pair of parallel ragged arrays: for
 * each row, {@link #columnIds(double[][])} holds the column indices of
 * the non-zero cells and {@link #partialValues(double[][])} holds the
 * corresponding cell values, both in ascending column order.
 *
 * <p>Usage: {@code PartialTokenBigramSvd <textFile>}
 */
public class PartialTokenBigramSvd {

    /**
     * Runs the demo: extracts bigrams from the file named by
     * {@code args[0]}, computes the partial SVD, and reports the result
     * through {@code TokenBigramSvd.reportSvd}.
     *
     * @param args command-line arguments; {@code args[0]} is the path of
     *     the text file to read
     * @throws IllegalArgumentException if no file argument is supplied
     * @throws Exception if extraction, factoring, or reporting fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage message rather than a bare index error.
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException(
                "Usage: PartialTokenBigramSvd <textFile>");
        }

        System.out.println("TokenBigramSVD");

        File textFile = new File(args[0]);
        MapSymbolTable symbolTable = new MapSymbolTable();
        TokenizerFactory tokenizerFactory
            = IndoEuropeanTokenizerFactory.INSTANCE;
        String charset = "ASCII";
        System.out.println("  Extracting Bigrams");
        System.out.println("    File=" + textFile.getCanonicalPath());
        System.out.println("    tokenizerFactory.getClass()="
                           + tokenizerFactory.getClass());
        System.out.println("    input charset=" + charset);
        double[][] values
            = TokenBigramSvd
            .extractBigrams(textFile,symbolTable,tokenizerFactory,charset);

        // Sparse view of the dense matrix: only non-zero cells are kept.
        int[][] columnIds = columnIds(values);
        double[][] partialValues = partialValues(values);

        // Note left from an earlier run (presumably the parameter values
        // in the order declared below, followed by epoch and error):
        // 2, 1.0, 0.002, 100, 0.0, 0.0, 10, 10000 : 9900 rmse=3.9478464572

        int maxFactors = 3;
        double featureInit = 1.0;
        double initialLearningRate = 0.002;
        int annealingRate = 100;
        double regularization = 0.0;
        double minImprovement = 0.0000;
        int minEpochs = 10;
        int maxEpochs = 10000;
        PrintWriter verbosePrintWriter
            = new PrintWriter(new OutputStreamWriter(System.out,charset));
        Reporter reporter
            = Reporters.writer(verbosePrintWriter).setLevel(LogLevel.DEBUG);

        System.out.println("  Computing SVD");
        System.out.println("    maxFactors=" + maxFactors);
        System.out.println("    featureInit=" + featureInit);
        System.out.println("    initialLearningRate=" + initialLearningRate);
        System.out.println("    annealingRate=" + annealingRate);
        System.out.println("    regularization=" + regularization);
        System.out.println("    minImprovement=" + minImprovement);
        System.out.println("    minEpochs=" + minEpochs);
        System.out.println("    maxEpochs=" + maxEpochs);
        System.out.println("    output charset=" + charset);
        SvdMatrix matrix
            = SvdMatrix.partialSvd(columnIds,
                                   partialValues,
                                   maxFactors,
                                   featureInit,
                                   initialLearningRate,
                                   annealingRate,
                                   regularization,
                                   reporter,
                                   minImprovement,
                                   minEpochs,
                                   maxEpochs);

        // The writer is created without auto-flush, so push out anything
        // still buffered before the final report is printed. It is flushed
        // rather than closed because closing would also close System.out.
        verbosePrintWriter.flush();

        TokenBigramSvd.reportSvd(values,matrix,symbolTable);
    }

    /**
     * Returns, for each row of the matrix, the column indices of its
     * non-zero cells.
     *
     * @param values dense matrix, indexed {@code [row][column]}
     * @return ragged array with one index array per row of {@code values}
     */
    static int[][] columnIds(double[][] values) {
        int[][] columnIds = new int[values.length][];
        for (int row = 0; row < values.length; ++row) {
            columnIds[row] = columnIdsRow(values[row]);
        }
        return columnIds;
    }

    /**
     * Returns the indices of the non-zero entries of a row, in ascending
     * order. An entry counts as non-zero when {@code value != 0}, so
     * negative zero is skipped and NaN is kept.
     *
     * @param values one dense row
     * @return indices {@code i} for which {@code values[i] != 0}
     */
    static int[] columnIdsRow(double[] values) {
        int[] columnIdsRow = new int[countNonZero(values)];
        int next = 0;
        for (int col = 0; col < values.length; ++col) {
            if (values[col] != 0) {
                columnIdsRow[next++] = col;
            }
        }
        return columnIdsRow;
    }

    /**
     * Returns, for each row of the matrix, the values of its non-zero
     * cells. Row {@code i} of the result lines up element-for-element
     * with row {@code i} of {@link #columnIds(double[][])}.
     *
     * @param values dense matrix, indexed {@code [row][column]}
     * @return ragged array with one value array per row of {@code values}
     */
    static double[][] partialValues(double[][] values) {
        double[][] partialValues = new double[values.length][];
        for (int row = 0; row < values.length; ++row) {
            partialValues[row] = partialValuesRow(values[row]);
        }
        return partialValues;
    }

    /**
     * Returns the non-zero entries of a row, in their original order.
     * Uses the same {@code value != 0} test as
     * {@link #columnIdsRow(double[])}, so the two results are parallel.
     *
     * @param values one dense row
     * @return the entries of {@code values} that are not equal to zero
     */
    static double[] partialValuesRow(double[] values) {
        double[] partialValuesRow = new double[countNonZero(values)];
        int next = 0;
        for (int col = 0; col < values.length; ++col) {
            if (values[col] != 0) {
                partialValuesRow[next++] = values[col];
            }
        }
        return partialValuesRow;
    }

    /** Counts the entries of {@code values} that are not equal to zero. */
    private static int countNonZero(double[] values) {
        int count = 0;
        for (double value : values) {
            if (value != 0) {
                ++count;
            }
        }
        return count;
    }

}



