import { isNoise } from 'libs/is-noise';
import { CLEANUP_REGEX, WORD_BOUNDARY_CHAR_REGEX } from '../text-utils';
import { Alignment } from './alignment';
/**
 * Aligns a sequence of source words against a growing target word sequence.
 *
 * The target words carry one `[begin, end]` timestamp pair each (`targetTimestamps`),
 * which lets callers restrict the alignment to a time window. The actual alignment is
 * delegated to `Alignment`; this class supplies the cost functions and the word
 * normalization, and feeds new target words into the aligner as they arrive.
 */
export class StringAligner {
    /**
     * @param {string[]} targetSequence - initial target words; stored lower-cased and trimmed.
     * @param {Array<[number, number]>} targetTimestamps - `[begin, end]` per target word.
     *   The array is kept by reference and appended to by `addNewWord`.
     * @param {number} insertionPenalty - base cost of inserting a word.
     * @param {number} deletionPenalty - cost of deleting a word.
     * @param {number} substitutionPenalty - scale applied to the normalized prefix distance.
     * @param {number} insertBetweenParagraphsPenalty - multiplier applied to the insertion
     *   penalty when the insertion follows a word that ends with a newline.
     * @param {*} chunkSize - forwarded unchanged to `Alignment`.
     */
    constructor(targetSequence, targetTimestamps, insertionPenalty, deletionPenalty, substitutionPenalty, insertBetweenParagraphsPenalty, chunkSize) {
        // Store the penalties before building the aligner so the cost callbacks below
        // already see valid values should Alignment call any of them during construction.
        this.targetTimestamps = targetTimestamps;
        this.deletionPenalty = deletionPenalty;
        this.substitutionPenalty = substitutionPenalty;
        this.insertionPenalty = insertionPenalty;
        this.insertBetweenParagraphsPenalty = insertBetweenParagraphsPenalty;
        this.currentWord = '';
        this.currentWordBegin = null;

        /*
          insertedWord is the word from automatic transcription that is being inserted
          into the user text right after the matchingWord.
          */
        this.wordInsertionPenalty = (_insertedWord, matchingWord = '') => {
            if (matchingWord.endsWith('\n')) {
                // it is cheaper to insert between paragraphs
                return this.insertBetweenParagraphsPenalty * this.insertionPenalty;
            }
            return this.insertionPenalty;
        };

        // Deleting a word always costs the same, regardless of the word.
        this.wordDeletionPenalty = () => {
            return this.deletionPenalty;
        };

        /*
          Substitution cost of matching a with b, based on the length of their common
          prefix and normalized by the length of the longer word. Two empty strings cost 0.
          */
        this.prefixDistance = (a, b) => {
            // words starting with '#' are filler words. They should not be matched
            if (b.startsWith('#')) {
                // NOTE: the returned value is just an arbitrary large number. Matching this word
                // should never be the best option.
                return 1000;
            }
            const shorterLength = Math.min(a.length, b.length);
            const longerLength = Math.max(a.length, b.length);
            if (longerLength === 0) {
                return 0;
            }
            // count the leading characters the two words have in common
            let prefixLength = 0;
            while (prefixLength < shorterLength && a[prefixLength] === b[prefixLength]) {
                prefixLength += 1;
            }
            // different words with similar length are closer than words with different lengths
            // prefers aligning words to words rather than nonspeech
            const difference = longerLength - prefixLength * 0.99 - shorterLength * 0.01;
            const normalizedDistance = difference / longerLength;
            return this.substitutionPenalty * normalizedDistance;
        };

        this.targetSequence = targetSequence.map((x) => x.toLowerCase().trim());
        this.aligner = new Alignment(this.targetSequence, this.prefixDistance, this.wordInsertionPenalty, this.wordDeletionPenalty, StringAligner.cleanWord, chunkSize);
    }

    /**
     * Normalizes every word of the list with `cleanWord`.
     * @param {string[]} words
     * @returns {string[]} a new array of cleaned words.
     */
    static cleanWords(words) {
        return words.map((x) => StringAligner.cleanWord(x));
    }

    /**
     * Normalizes a single word: lower-cases and trims it, strips `.`, `,` and `?`,
     * and removes everything matched by `CLEANUP_REGEX`.
     * @param {string} word
     * @returns {string} the cleaned word; a trailing newline of the input is kept.
     */
    static cleanWord(word) {
        let cleaned = word.toLowerCase().trim();
        cleaned = cleaned.replace(/[.,?]/g, '');
        cleaned = cleaned.replace(CLEANUP_REGEX, '');
        if (word.endsWith('\n')) {
            cleaned += '\n'; // preserve newlines to handle correctly paragraph ends
        }
        return cleaned;
    }

    /**
     * Aligns `sourceSequence` against the target words that lie in the given time window.
     * @param {string[]} sourceSequence - words to align.
     * @param {number} timeFrom - start of the window; see `indexAfterTime`.
     * @param {number} timeTo - end of the window; see `indexBeforeTime`.
     * @param {*} deadlineUnixTime - forwarded unchanged to `Alignment.match`.
     * @returns {*} the `matchIndices` field of the result of `Alignment.match`.
     */
    compareSequence(sourceSequence, timeFrom, timeTo, deadlineUnixTime) {
        const indexFrom = this.indexAfterTime(timeFrom);
        const indexTo = this.indexBeforeTime(timeTo);
        // indexTo is the last index inside the window; match() is given indexTo + 1
        // (presumably an exclusive end index - confirm against Alignment).
        const { matchIndices } = this.aligner.match(sourceSequence, indexFrom, indexTo + 1, deadlineUnixTime);
        return matchIndices;
    }

    /**
     * Feeds a newly recognized phrase into the aligner.
     *
     * Phrases are accumulated until one ends a word (see `isWordEnd`); the accumulated
     * text is then split into words and each word is pushed to the aligner together
     * with a timestamp.
     * @param {string} word - the new phrase.
     * @param {number} begin - begin time of the phrase.
     * @param {number} end - end time of the phrase.
     */
    addNewWord(word, begin, end) {
        this.extendCurrentWord(word, begin); // join phrases that form a single word (e. g. "43")
        if (this.isWordEnd(word) && this.currentWordBegin !== null) {
            // split a phrase consisting of multiple words
            const words = this.splitWords(this.currentWord);
            for (const currentWord of words) {
                // the individual words are added each separately, but they have the same timestamps
                // maybe it would be better to split it to individual timestamps?
                this.aligner.push(StringAligner.cleanWord(currentWord));
                this.targetTimestamps.push([this.currentWordBegin, end]);
            }
            this.cleanCurrentWord();
        }
    }

    /**
     * Finds the first target word that begins at or after `time`.
     * @param {number} time
     * @returns {number} index of that word, or `targetTimestamps.length` when there is none.
     */
    indexAfterTime(time) {
        const index = this.targetTimestamps.findIndex((t) => t[0] >= time);
        if (index === -1) {
            return this.targetTimestamps.length;
        }
        return index;
    }

    /**
     * Finds the last target word preceding the first word that ends after `time`.
     * NOTE(review): this is "the last word ending at or before `time`" only if the
     * timestamps are ordered by end time - confirm with the producer of the timestamps.
     * @param {number} time
     * @returns {number} index of that word; `-1` when the very first word already ends
     *   after `time` or when there are no words at all.
     */
    indexBeforeTime(time) {
        const index = this.targetTimestamps.findIndex((t) => t[1] > time);
        if (index === -1) {
            // No word ends after `time`, so the last existing word is the answer.
            // Returning `length` here would make compareSequence pass `length + 1`
            // as the end index, one past the end of the target sequence.
            return this.targetTimestamps.length - 1;
        }
        return index - 1;
    }

    /**
     * Appends a phrase to the word being accumulated and remembers the begin time of
     * the first phrase of that word.
     * @param {string} word
     * @param {number} begin
     */
    extendCurrentWord(word, begin) {
        this.currentWord += word;
        // keep the begin of the first phrase; only an explicit null means "not started yet"
        this.currentWordBegin = this.currentWordBegin !== null ? this.currentWordBegin : begin;
    }

    /** Resets the accumulated word and its begin time. */
    cleanCurrentWord() {
        this.currentWord = '';
        this.currentWordBegin = null;
    }

    /**
     * Decides whether a phrase terminates the word being accumulated: an empty phrase,
     * a noise phrase (per `isNoise`), or a phrase whose last character matches
     * `WORD_BOUNDARY_CHAR_REGEX`.
     * @param {string} word
     * @returns {boolean}
     */
    isWordEnd(word) {
        if (word.length === 0) {
            return true;
        }
        if (isNoise(word)) {
            return true;
        }
        // NOTE(review): test() is stateful when the regex carries the `g` or `y` flag;
        // confirm WORD_BOUNDARY_CHAR_REGEX has neither.
        return WORD_BOUNDARY_CHAR_REGEX.test(word.slice(-1));
    }

    /**
     * Splits text on `WORD_BOUNDARY_CHAR_REGEX` and drops empty pieces.
     * @param {string} text
     * @returns {string[]}
     */
    splitWords(text) {
        return text.split(WORD_BOUNDARY_CHAR_REGEX).filter((word) => word.length > 0);
    }
}
