'use strict';

import arabicUtils from './arabic/arabicUtils.mjs';
import farsiWordsUtils from './farsi/farsiUtils.mjs';
import englishUtils from './english/englishUtils.mjs';

import XRegExp from 'xregexp';
// Constructor used to build regular expressions in this module: the
// `default` property of the imported XRegExp binding when it has one,
// otherwise the imported binding itself.
const RegEx = XRegExp.default || XRegExp;

// Characters accepted as an opening quotation mark:
// U+0022 ("), U+00AB («), U+201C (“), U+201E („).
const openQuotes = '\u0022\u00AB\u201C\u201E';
// Characters accepted as a closing quotation mark:
// U+0022 ("), U+00BB (»), U+201D (”), U+201F (‟).
const closeQuotes = '\u0022\u00BB\u201D\u201F';

/**
 * Builds a regular expression matching one angle-bracket delimited tag:
 * a `<`, any run of characters other than `<` and `>`, then a `>`.
 *
 * @param {string} [flags] - Flags forwarded to the regular expression constructor.
 * @returns {RegExp} A new expression on every call.
 */
function getTagRe(flags) {
  const tagPattern = '<[^<>]*>';
  return new RegEx(tagPattern, flags);
}

/**
 * Collects every tag occurrence found in `text`.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} The matched tags in order of appearance, or an empty
 *   array when there are none.
 */
function getTags(text) {
  const found = text.match(getTagRe('g'));
  if (!found) {
    return [];
  }
  return found;
}

/**
 * Replaces the first occurrence of each entry of `parts` in `text` with a
 * placeholder of the form `#<tagName>_<index>#`, where `index` is the
 * entry's position in `parts`. Entries are processed in order, each one
 * against the result of the previous replacement.
 *
 * @param {string} text - Text to transform.
 * @param {string[]} parts - Fragments to swap for placeholders.
 * @param {string} tagName - Name embedded in every placeholder.
 * @returns {string} The text with placeholders substituted.
 */
function processingText(text, parts, tagName) {
  return parts.reduce((result, fragment, position) => {
    const placeholder = '#' + tagName + '_' + position + '#';
    return result.replace(fragment, placeholder);
  }, text);
}

/**
 * Builds a regular expression matching a quoted fragment: one character
 * from `openQuotes`, any run of characters that are neither opening nor
 * closing quotes, then one character from `closeQuotes`.
 *
 * @param {string} [flags] - Flags forwarded to the regular expression constructor.
 * @returns {RegExp} A new expression on every call.
 */
function getQuotePartRe(flags) {
  const opening = '[' + openQuotes + ']';
  const inner = '[^' + openQuotes + closeQuotes + ']*';
  const closing = '[' + closeQuotes + ']';
  return new RegEx(opening + inner + closing, flags);
}

/**
 * Restores fragments previously swapped out for placeholders of the form
 * `#<tagName>_<index>#`: the first occurrence of each placeholder is
 * replaced with the entry of `textParts` at the same index.
 *
 * @param {string} text - Text containing placeholders.
 * @param {string[]} textParts - Original fragments, indexed like the placeholders.
 * @param {string} tagName - Name embedded in the placeholders.
 * @returns {string} The text with the fragments put back.
 */
function setTextParts(text, textParts, tagName) {
  return textParts.reduce((restored, part, index) => {
    const placeholder = '#' + tagName + '_' + index + '#';
    // A replacer function inserts `part` verbatim. Passing it as a plain
    // replacement string would expand sequences such as `$&` or `$$` that
    // may legitimately occur inside the restored fragment.
    return restored.replace(placeholder, () => part);
  }, text);
}

/**
 * Extracts quoted fragments from `query`.
 *
 * Tags found by `getTags` are swapped for `#tag_<index>#` placeholders
 * before matching, so quote detection runs on text without the tags; the
 * tags are put back into every returned fragment. The quote characters
 * recognised are the ones `getQuotePartRe` is built from. Only fragments
 * containing a run of at least three letters are kept.
 *
 * NOTE(review): the filter runs on the placeholder form of the fragment,
 * and the placeholder text itself contains three letters, so any fragment
 * that includes a tag passes the filter — confirm this is intended.
 *
 * @param {string} query - Text to scan.
 * @returns {string[]} Quoted fragments (quote marks included) with tags restored.
 */
function getQuotes(query) {
  const tagName = 'tag';
  const tags = getTags(query);
  const queryWithoutTags = processingText(query, tags, tagName);
  const quoteRe = getQuotePartRe('g');
  // Built through the module's resolved `RegEx` constructor, like the other
  // helpers, rather than through the raw `XRegExp` import binding.
  const quoteContentRe = new RegEx('\\p{L}{3,}');

  return (queryWithoutTags.match(quoteRe) || [])
    .filter(quote => quoteContentRe.test(quote))
    .map(quote => setTextParts(quote, tags, tagName));
}

/**
 * Matches `startLineRe` against `paragraph` and returns the entries of the
 * match result that themselves pass `startLineRe`.
 *
 * @param {string} paragraph - Text to inspect.
 * @param {RegExp} startLineRe - Expression describing the leading fragment.
 * @returns {string[]} Matching entries, or an empty array when nothing matched.
 */
function getStartLine(paragraph, startLineRe) {
  const matched = paragraph.match(startLineRe);
  if (!matched) {
    return [];
  }
  return matched.filter(candidate => startLineRe.test(candidate));
}

/**
 * Normalises a paragraph before splitting: removes whatever `startLineRe`
 * matches, collapses every run of two or more whitespace characters into a
 * single space, and trims both ends.
 *
 * @param {string} paragraph - Raw paragraph text.
 * @param {RegExp} startLineRe - Expression whose match is removed.
 * @returns {string} The cleaned paragraph.
 */
function preProcessing(paragraph, startLineRe) {
  const withoutPrefix = paragraph.replace(startLineRe, '');
  const singleSpaced = withoutPrefix.replace(/\s{2,}/g, ' ');
  return singleSpaced.trim();
}

/**
 * Splits `paragraph` on `splitRe` and drops every falsy piece (such as the
 * empty strings produced by leading or trailing separators).
 *
 * @param {string} paragraph - Text to split.
 * @param {RegExp|string} splitRe - Separator passed to `String.prototype.split`.
 * @returns {string[]} The non-empty pieces in their original order.
 */
function splitBySentences(paragraph, splitRe) {
  const pieces = [];
  for (const piece of paragraph.split(splitRe)) {
    if (piece) {
      pieces.push(piece);
    }
  }
  return pieces;
}

/**
 * Returns the stop-word list of the utility module that handles `lang`.
 *
 * The language-to-module mapping matches the other dispatchers in this
 * file: 'en' → englishUtils, 'fa' → farsiWordsUtils, 'ar' → arabicUtils.
 *
 * @param {string} lang - Language code.
 * @returns {*} The value returned by the module's `getStopWordsList()`, or
 *   `undefined` for any other language code.
 */
function getStopwordsByLang(lang) {
  switch (lang) {
    case 'en':
      return englishUtils.getStopWordsList();
    case 'fa':
      return farsiWordsUtils.getStopWordsList();
    case 'ar':
      return arabicUtils.getStopWordsList();
    default:
      return undefined;
  }
}

function filterByStopWords(words, lang) {
  var stopWords = getStopwordsByLang(lang);
  return words.filter(function(rawWord) {
    var word = replaceDiacritic(rawWord.toLowerCase(), lang);
    return !stopWords.hasOwnProperty(word);
  });
}

/**
 * Tokenizes `sentence` with the tokenizer of the utility module that
 * handles `lang` ('en', 'fa' or 'ar').
 *
 * @param {string} sentence - Text to tokenize.
 * @param {string} lang - Language code.
 * @returns {*} The tokenizer's result, or `sentence` unchanged for any
 *   other language code.
 */
function tokenizing(sentence, lang) {
  switch (lang) {
    case 'en':
      return englishUtils.tokenizer(sentence);
    case 'fa':
      return farsiWordsUtils.tokenizer(sentence);
    case 'ar':
      return arabicUtils.tokenizer(sentence);
    default:
      return sentence;
  }
}

/**
 * Runs `sentence` through the `replaceDiacritic` routine of the utility
 * module that handles `lang` ('en', 'fa' or 'ar').
 *
 * @param {string} sentence - Text to process.
 * @param {string} lang - Language code.
 * @returns {*} The routine's result, or `sentence` unchanged for any other
 *   language code.
 */
function replaceDiacritic(sentence, lang) {
  if (lang === 'en') {
    return englishUtils.replaceDiacritic(sentence);
  }
  if (lang === 'fa') {
    return farsiWordsUtils.replaceDiacritic(sentence);
  }
  if (lang === 'ar') {
    return arabicUtils.replaceDiacritic(sentence);
  }
  return sentence;
}

/**
 * Stems `token` with the stemmer of the utility module that handles
 * `lang` ('en', 'fa' or 'ar').
 *
 * @param {string} token - Word to stem.
 * @param {string} lang - Language code.
 * @returns {*} The stemmer's result, or `undefined` for any other language
 *   code.
 */
function stemming(token, lang) {
  switch (lang) {
    case 'en':
      return englishUtils.stemmer(token);
    case 'fa':
      return farsiWordsUtils.stemmer(token);
    case 'ar':
      return arabicUtils.stemmer(token);
    default:
      return undefined;
  }
}

/**
 * Computes the phonetic code of `stem` with the `soundEx` routine of the
 * utility module that handles `lang` ('en', 'fa' or 'ar').
 *
 * @param {string} stem - Stemmed word.
 * @param {string} lang - Language code.
 * @returns {*} The routine's result, or `undefined` for any other language
 *   code.
 */
function getPhoneme(stem, lang) {
  if (lang === 'en') {
    return englishUtils.soundEx(stem);
  }
  if (lang === 'fa') {
    return farsiWordsUtils.soundEx(stem);
  }
  if (lang === 'ar') {
    return arabicUtils.soundEx(stem);
  }
  return undefined;
}

// Exported configuration keyed by language code. Each code maps to the
// label 'Arabic' or 'Latin' — presumably the script family the language is
// written in; confirm against the consumers of this export.
const languagesConfig = {
  ar: 'Arabic',
  fa: 'Arabic',
  fr: 'Latin',
  en: 'Latin',
  // Holds a language code rather than a label; presumably the fallback
  // language — confirm with callers.
  defaultLang: 'en'
};

/**
 * Converts `str` to a string via `makeString` and prefixes each of the
 * characters `. * + ? ^ = ! : $ { } ( ) | [ ] / \` with a backslash.
 *
 * @param {*} str - Value to escape; `null` and `undefined` yield ''.
 * @returns {string} The escaped text.
 */
function escapeRegExp(str) {
  const specialCharsRe = /([.*+?^=!:${}()|[\]\/\\])/g;
  const source = makeString(str);
  return source.replace(specialCharsRe, '\\$1');
}

/**
 * Converts a value to a string, mapping `null` and `undefined` to the
 * empty string.
 *
 * @param {*} object - Value to convert.
 * @returns {string} '' for `null`/`undefined`, otherwise the value
 *   concatenated onto an empty string.
 */
function makeString(object) {
  return object == null ? '' : '' + object;
}

/**
 * Splits a paragraph into sentences on terminal punctuation (dot, question,
 * exclamation and ellipsis marks) that is followed by an upper-case letter,
 * a `<`, or a `#` placeholder, optionally preceded by `(`.
 *
 * Steps:
 *  1. Quoted fragments (see `getQuotes`) are swapped for `#quote_<n>#`
 *     placeholders so punctuation inside them cannot trigger a split.
 *  2. The leading fragment matched by the start-line expression is cut off
 *     and whitespace is normalised (`preProcessing`).
 *  3. The text is split, the leading fragment is put back in front of the
 *     first piece, and each piece gets its separator re-attached.
 *  4. Placeholders are replaced with the original quoted fragments.
 *
 * Pieces consisting only of punctuation are skipped. A rebuilt sentence
 * that cannot be found in the placeholder form of the paragraph is logged
 * and left out of the result.
 *
 * @param {string} paragraph - Paragraph text.
 * @returns {string[]} The sentences in order.
 */
function splitEn(paragraph) {
  const enDigits = '\u06f0-\u06f9\u0660-\u0669\\d';
  const questionMark = '\u003F\u2047\uFE56\u2048\u2049\u203D\u061F';
  const ellipsisMark = '\u2026\u0EAF\u1801\u0E2F\u22EE\u22EF\u22F0\u22F1\uFE19';
  const exclamationMark =
    '\u0021\u01C3\u203C\u2048\u2049\u2755\u2757\u2762\u2763\uA71D\uA71E\uA71F\uFE57\uFF01\uE002';
  const dotMark = '\u002E';
  const tagMark = '#';

  // All expressions go through the module's resolved `RegEx` constructor
  // rather than the raw `XRegExp` import binding.
  const punctuationRe = new RegEx(
    escapeRegExp(dotMark + questionMark + exclamationMark + ellipsisMark)
  );
  const spliteRe = new RegEx(
    '[' + punctuationRe.source + `][\\s\n\t]*(?=\\(?[\\p{Lu}<${tagMark}])`,
    'g'
  );
  const startLineRe = new RegEx('^[^<&' + enDigits + '\\p{Lu}]+', 'i');
  // Non-global, so it carries no state between uses; built once instead of
  // once per piece.
  const lastSymbol = new RegEx('[' + punctuationRe.source + ']+$');

  const quotes = getQuotes(paragraph);
  const protectedText = processingText(paragraph, quotes, 'quote');
  // The start-line expression is non-global and has no capture groups, so
  // there is at most one entry to join.
  const startLine = getStartLine(protectedText, startLineRe).join('');
  const cleaned = preProcessing(protectedText, startLineRe);

  const parts = splitBySentences(cleaned.trim(), spliteRe);
  // Parenthesised on purpose: when the split produced no pieces the first
  // piece must default to '' before the prefix is prepended, otherwise the
  // text "undefined" would be concatenated into the result.
  parts[0] = startLine + (parts[0] || '');
  const closeSigns = cleaned.match(spliteRe) || [];

  const paragraphWithoutQuotes = startLine + cleaned;
  if (parts.length === 1) {
    return [setTextParts(parts[0], quotes, 'quote')];
  }

  const sentences = [];
  parts.forEach((part, index) => {
    const partWithoutSpace = part.trim();
    if (partWithoutSpace.replace(lastSymbol, '').length === 0) {
      // Nothing but punctuation in this piece.
      return;
    }
    const sentence = partWithoutSpace + (closeSigns[index] || '');
    if (paragraphWithoutQuotes.includes(sentence)) {
      sentences.push(setTextParts(sentence, quotes, 'quote'));
    } else {
      console.log(
        'Get incorrect parsed case for paragraph ' + paragraphWithoutQuotes
      );
    }
  });

  return sentences;
}

/**
 * Splits an Arabic or Farsi paragraph into sentences.
 *
 * The paragraph is first split on the Arabic comma (U+060C) when it is
 * followed by an Arabic-script character or `<` (optionally after `(`).
 * A piece whose token count (per `tokenizing`) exceeds
 * MAX_NUM_WORDS_BETWEEN_COMMA is split again on a free-standing waw
 * (U+0648) surrounded by whitespace. Every sentence keeps the separator
 * that followed it in the cleaned paragraph.
 *
 * @param {string} paragraph - Paragraph text.
 * @param {string} lang - Language code forwarded to `tokenizing`.
 * @returns {string[]} The sentences in order.
 */
function splitArFa(paragraph, lang) {
  const MAX_NUM_WORDS_BETWEEN_COMMA = 100;
  const arFaDigits = '\u06f0-\u06f9\u0660-\u0669';
  const waMark = '\u0648'; // Arabic letter waw, used as the secondary separator
  const comma = '\u060c'; // Arabic comma, used as the primary separator

  // All expressions go through the module's resolved `RegEx` constructor
  // rather than the raw `XRegExp` import binding.
  const punctuationRe = new RegEx(escapeRegExp(comma));
  const waRe = new RegEx(escapeRegExp(waMark));

  const splitByCommaRe = new RegEx(
    '[' + punctuationRe.source + '][\\s\n\t]*(?=\\(?[\\p{Arabic}<])',
    'g'
  );
  const splitByWawRe = new RegEx(
    '[\\s\n\t]+[' + waRe.source + '][\\s\n\t]+(?=\\(?[\\p{Arabic}<])',
    'g'
  );
  const startLineRe = new RegEx('^[^<#' + arFaDigits + '\\p{Arabic}]+');

  const cleaned = preProcessing(paragraph, startLineRe);
  const parts = splitBySentences(cleaned, splitByCommaRe);
  const signs = cleaned.match(splitByCommaRe) || [];

  const sentences = [];
  parts.forEach((part, index) => {
    const commaSign = signs[index] || '';
    const words = tokenizing(part, lang);

    if (words.length > MAX_NUM_WORDS_BETWEEN_COMMA) {
      // Separators are collected from this piece only, so their positions
      // line up with the pieces produced by splitting the same text.
      const waSigns = part.match(splitByWawRe) || [];
      const waSentences = splitBySentences(part, splitByWawRe).map(
        (sent, waIndex) => sent + (waSigns[waIndex] || '')
      );
      if (waSentences.length > 0) {
        // The comma separator that closed the whole piece belongs to its
        // last sub-sentence.
        waSentences[waSentences.length - 1] += commaSign;
      }
      sentences.push(...waSentences);
    } else {
      sentences.push(part + commaSign);
    }
  });
  return sentences;
}

/**
 * Splits `paragraph` into sentences using the splitter that handles `lang`:
 * `splitEn` for 'en', 'fr', 'pt', 'de' and 'it'; `splitArFa` for 'ar' and
 * 'fa'.
 *
 * @param {string} paragraph - Paragraph text.
 * @param {string} lang - Language code.
 * @returns {string[]} The sentences; an empty array when the paragraph is
 *   blank or the language is not handled (a warning is logged in the
 *   latter case).
 */
function parseSentences(paragraph, lang) {
  if (paragraph.trim().length === 0) {
    return [];
  }

  const latinSplitLangs = ['en', 'fr', 'pt', 'de', 'it'];
  if (latinSplitLangs.includes(lang)) {
    return splitEn(paragraph);
  }
  if (lang === 'ar' || lang === 'fa') {
    return splitArFa(paragraph, lang);
  }

  console.warn('Unhandled language: ' + lang + ' for split by sentences.');
  return [];
}

// Public API of the module. `generateWords` is re-exported from arabicUtils;
// the remaining members are defined in this file.
export default {
  generateWords: arabicUtils.generateWords,
  filterByStopWords,
  tokenizing,
  replaceDiacritic,
  stemming,
  getPhoneme,
  languagesConfig,
  parseSentences
};
