✨ feat: better search

2026-01-06 04:33:03 +00:00 · 2025-02-22 10:27:34 +07:00
parent 0c7cd5d7e9
commit 31b72412fe
1 changed files with 385 additions and 207 deletions
--- a/script.user.js
+++ b/script.user.js
@@ -42,7 +42,56 @@ const CONFIG = {
  sourceStringContainer: "#source_phrase_container",
  autoSearchInterval: 1000,
-  fuzzyThreshold: 0.7,
+
  // Search thresholds and scoring configuration
  thresholds: {
    fuzzy: 0.7, // Base fuzzy matching threshold
    wordOverlap: 0.5, // Word overlap threshold for longer phrases
    // Word normalization settings
    normalization: {
      stripChars: /[.,!?;:'")\]}/\\]/g, // Remove these characters when normalizing words
      maxCharDiff: 2, // Maximum allowed character difference for similar words
      minWordLength: 4, // Minimum word length to apply fuzzy matching
      minVariationSimilarity: 0.75, // Minimum similarity for word variations
      wordEndings: ["s", "es", "ed", "ing", "'s"], // Common word endings to normalize
    },
    // Base scores
    scores: {
      exactMatch: 1.0,
      exactWordMatch: 0.9,
      contextBaseScore: 0.6,
      singularPluralMatch: 0.95,
      singularPluralContext: 0.85,
      partialMatchBase: 0.6,
      wordVariationMatch: 0.85,
    },
    // Multipliers and penalties
    multipliers: {
      autoSearchThreshold: 0.95,
      singleWordThreshold: 1.4,
      baseThresholdIncrease: 1.1,
      positionPenalty: 1.5,
      lengthDiffPenalty: 0.2,
      minLengthPenaltyScore: 0.3,
    },
    // Weights for different scoring components
    weights: {
      fuzzyMatchWeight: 0.2,
      wordOverlapWeight: 0.8,
      positionMatchWeight: 0.4,
      positionOverlapWeight: 0.6,
    },
    // Cache limits
    cacheLimits: {
      similarity: 10000,
      combinations: 1000,
    },
  },
  metadata: {
    version: "1.1.4",
@@ -152,6 +201,60 @@ function similarity(s1, s2) {
  return (longerLength - levenshteinDistance(s1, s2)) / longerLength;
 }
 function normalizeWord(word) {
  // Remove specified characters
  word = word
    .toLowerCase()
    .replace(CONFIG.thresholds.normalization.stripChars, "");
  // Remove common word endings
  for (const ending of CONFIG.thresholds.normalization.wordEndings) {
    if (word.endsWith(ending)) {
      word = word.slice(0, -ending.length);
      break;
    }
  }
  return word;
 }
 // Cache for word combinations and similarity scores
 const combinationsCache = new Map();
 const similarityCache = new Map();
 function getCachedSimilarity(str1, str2) {
  const key = `${str1}|${str2}`;
  if (similarityCache.has(key)) {
    return similarityCache.get(key);
  }
  const score = similarity(str1, str2);
  similarityCache.set(key, score);
  return score;
 }
 function areWordsSimilar(word1, word2) {
  const norm1 = normalizeWord(word1);
  const norm2 = normalizeWord(word2);
  // If words are too short, require exact match
  if (
    norm1.length < CONFIG.thresholds.normalization.minWordLength ||
    norm2.length < CONFIG.thresholds.normalization.minWordLength
  ) {
    return norm1 === norm2;
  }
  // Check character difference
  const charDiff = Math.abs(norm1.length - norm2.length);
  if (charDiff > CONFIG.thresholds.normalization.maxCharDiff) {
    return false;
  }
  // Calculate similarity
  const similarity = getCachedSimilarity(norm1, norm2);
  return similarity >= CONFIG.thresholds.normalization.minVariationSimilarity;
 }
 function TranslatorTool() {
  var container;
  var translationData = [];
@@ -865,7 +968,12 @@ function TranslatorTool() {
    log("info", "Setting up event listeners");
    // Debounce the search with 300ms delay
    const debouncedSearch = debounce(() => {
-      searchTranslations();
+      if (!searchInput.value.trim()) {
        // If textbox is cleared, force a search of the editor content
        checkForEditorContent(true);
      } else {
        searchTranslations(searchInput.value, false);
      }
    }, 300);
    searchInput.addEventListener("input", function () {
@@ -987,6 +1095,7 @@ function TranslatorTool() {
            terms: content.terms,
            stringId: content.stringId,
            length: content.fullText.length,
            lastSearchedText: lastSearchedText,
          });
          findMatches(content.fullText);
        }
@@ -1298,10 +1407,6 @@ function TranslatorTool() {
    };
  }
  // Cache for word combinations
  const combinationsCache = new Map();
  const similarityCache = new Map();
  function getCachedCombinations(text) {
    if (combinationsCache.has(text)) {
      return combinationsCache.get(text);
@@ -1343,237 +1448,310 @@ function TranslatorTool() {
    return combinations;
  }
-  function getCachedSimilarity(str1, str2) {
+  function searchTranslations(text, isAutoSearch = false) {
-    const key = `${str1}|${str2}`;
+    if (!text || !translationData.length) {
-    if (similarityCache.has(key)) {
+      updateResults("");
-      return similarityCache.get(key);
+      lastSearchedText = "";
      return;
    }
    const score = similarity(str1, str2);
    similarityCache.set(key, score);
    return score;
  }
-  function findMatches(text) {
+    // For manual search
-    if (!text || !translationData.length) return;
+    let searchText = text;
-
+    if (!isAutoSearch) {
-    log("debug", "Finding matches for text:", {
+      const editorTextbox = document.querySelector(CONFIG.textboxSelector);
-      text: text,
+      if (editorTextbox && editorTextbox.value.trim()) {
-      wordCount: text.split(/\s+/).filter((w) => w.length > 0).length,
+        searchText = editorTextbox.value;
    });
    const matches = [];
    const seenCombinations = new Set();
    const combinations = getCachedCombinations(text);
    log("debug", "Generated combinations:", combinations);
    // Pre-calculate source combinations for each entry
    const entryCombinations = new Map();
    translationData.forEach((entry) => {
      entryCombinations.set(entry, getCachedCombinations(entry.source));
    });
    combinations.forEach(function (combination) {
      if (!combination) return;
      const combinationLower = combination.toLowerCase();
      // Early exit if we already have enough high-quality matches
      if (matches.length > 20 && matches[19].score > 0.9) {
        return;
      }
      translationData.forEach(function (entry) {
        const uniqueKey = `${entry.source.toLowerCase()}_${
          entry.category || "default"
        }`;
        if (seenCombinations.has(uniqueKey)) return;
        const entryLower = entry.source.toLowerCase();
        // For exact matches (case-insensitive)
        if (entryLower === combinationLower) {
          seenCombinations.add(uniqueKey);
          matches.push({
            entry: entry,
            score: 1,
            matchedWord: combination,
          });
          return;
        }
        // Only proceed if the source is significant
        if (!isSignificantPhrase(entry.source)) {
          return;
        }
        // Get cached source combinations
        const sourceCombinations = entryCombinations.get(entry);
        // Find best matching combination
        let bestScore = 0;
        let bestMatch = "";
        let bestSourceCombo = "";
        for (const sourceCombo of sourceCombinations) {
          const score = getCachedSimilarity(
            sourceCombo.toLowerCase(),
            combinationLower
          );
          // Early exit if score is too low
          if (score < 0.8) continue;
          const sourceWordCount = sourceCombo.split(/\s+/).length;
          const combinationWordCount = combination.split(/\s+/).length;
          let adjustedScore = score;
          // Heavy penalties for mismatches
          if (Math.abs(sourceWordCount - combinationWordCount) > 0) {
            adjustedScore *= 0.4;
          }
          if (combinationWordCount === 1 && sourceWordCount > 1) {
            adjustedScore *= 0.3;
          }
          // Exact word boundary match bonus
          const isExactMatch = new RegExp(`\\b${combinationLower}\\b`).test(
            sourceCombo.toLowerCase()
          );
          if (isExactMatch) {
            adjustedScore *= 1.3;
          }
          if (adjustedScore > bestScore) {
            bestScore = adjustedScore;
            bestMatch = combination;
            bestSourceCombo = sourceCombo;
          }
        }
        // Stricter thresholds
        let threshold = CONFIG.fuzzyThreshold * 1.2;
        if (combination.split(/\s+/).length === 1) {
          threshold *= 1.4;
        }
        if (bestScore >= threshold && !seenCombinations.has(uniqueKey)) {
          seenCombinations.add(uniqueKey);
          matches.push({
            entry: entry,
            score: bestScore,
            matchedWord: bestMatch,
          });
        }
      });
    });
    // Clear caches if they get too large
    if (similarityCache.size > 10000) {
      similarityCache.clear();
    }
    if (combinationsCache.size > 1000) {
      combinationsCache.clear();
    }
-    // Sort matches by score first, then by category
+    const query = searchText.toLowerCase().trim();
-    matches.sort(function (a, b) {
+    if (!isAutoSearch && query.length <= 1) {
      const aWordCount = a.matchedWord.split(/\s+/).length;
      const bWordCount = b.matchedWord.split(/\s+/).length;
      if (Math.abs(b.score - a.score) < 0.05) {
        if (aWordCount !== bWordCount) {
          return bWordCount - aWordCount;
        }
        if (!!a.entry.category !== !!b.entry.category) {
          return a.entry.category ? -1 : 1;
        }
        return b.matchedWord.length - a.matchedWord.length;
      }
      return b.score - a.score;
    });
    log(
      "info",
      "Final matches:",
      matches.map((match) => ({
        source: match.entry.source,
        matchedWord: match.matchedWord,
        score: Math.round(match.score * 100) + "%",
        category: match.entry.category || "none",
      }))
    );
    displayFuzzyMatches(matches);
  }
  function searchTranslations() {
    var query = searchInput.value.toLowerCase().trim();
    if (!query || query.length <= 1) {
      updateResults("");
      lastSearchedText = "";
      checkForEditorContent(true);
      return;
    }
-    log("info", "Searching translations for", { query: query });
+    log(
-    var matches = [];
+      "info",
      `${isAutoSearch ? "Auto" : "Manual"} searching translations for`,
      {
        query: query,
        originalText: text,
        editorText: !isAutoSearch ? searchText : undefined,
        isAutoSearch: isAutoSearch,
      }
    );
-    // Find matches
+    const matches = [];
-    translationData.forEach(function (entry) {
+    const seenEntries = new Set();
      let score = 0;
-      // For short queries (2-3 chars), use stricter matching
+    // For auto-search or long queries, break down into significant phrases
-      if (query.length <= 3) {
+    const searchPhrases = [];
-        // Only match if it's a complete word match or surrounded by word boundaries
+    if (isAutoSearch || query.split(/\s+/).length > 3) {
-        const regex = new RegExp(`\\b${query}\\b`, "i");
+      // Get word combinations for better partial matching
-        if (
+      searchPhrases.push(...getCachedCombinations(query));
-          regex.test(entry.source) ||
+    } else {
-          regex.test(entry.target) ||
+      searchPhrases.push(query);
-          (entry.note && regex.test(entry.note))
+    }
-        ) {
+
-          score = 1;
+    // Remove duplicates and empty phrases
    const uniquePhrases = [...new Set(searchPhrases)].filter(
      (phrase) => phrase && phrase.length > 2
    );
    log("debug", "Searching with phrases:", uniquePhrases);
    translationData.forEach((entry) => {
      const entryKey = `${entry.source}_${entry.category || ""}`;
      if (seenEntries.has(entryKey)) return;
      let bestScore = 0;
      let bestPhrase = "";
      // Try each search phrase against the entry
      for (const searchPhrase of uniquePhrases) {
        let score = 0;
        const searchWords = searchPhrase.split(/\s+/);
        // For single words or short phrases, use enhanced matching
        if (searchWords.length === 1 || searchPhrase.length <= 3) {
          const sourceWords = entry.source.toLowerCase().split(/\s+/);
          const targetWords = entry.target.toLowerCase().split(/\s+/);
          // Check for word variations and similarities
          const hasVariationMatch = searchWords.some(
            (searchWord) =>
              sourceWords.some((sourceWord) =>
                areWordsSimilar(searchWord, sourceWord)
              ) ||
              targetWords.some((targetWord) =>
                areWordsSimilar(searchWord, targetWord)
              )
          );
          if (hasVariationMatch) {
            score = CONFIG.thresholds.scores.wordVariationMatch;
            // Boost score for closer matches
            const bestSourceMatch = Math.max(
              ...sourceWords.map((w) =>
                Math.max(
                  ...searchWords.map((sw) =>
                    getCachedSimilarity(normalizeWord(w), normalizeWord(sw))
                  )
                )
              )
            );
            const bestTargetMatch = Math.max(
              ...targetWords.map((w) =>
                Math.max(
                  ...searchWords.map((sw) =>
                    getCachedSimilarity(normalizeWord(w), normalizeWord(sw))
                  )
                )
              )
            );
            const bestMatch = Math.max(bestSourceMatch, bestTargetMatch);
            score = Math.max(score, bestMatch);
          }
          // Exact matching
          const regex = new RegExp(`\\b${searchPhrase}\\b`, "i");
          if (regex.test(entry.source) || regex.test(entry.target)) {
            score = Math.max(score, CONFIG.thresholds.scores.exactWordMatch);
          }
        } else {
          // For longer phrases, use stricter matching
          const sourceWords = entry.source.toLowerCase().split(/\s+/);
          const targetWords = entry.target.toLowerCase().split(/\s+/);
          // Calculate word overlap with stricter position consideration
          const sourceOverlap = calculateOverlapScore(searchWords, sourceWords);
          const targetOverlap = calculateOverlapScore(searchWords, targetWords);
          // Only use fuzzy matching if there's significant word overlap
          if (
            Math.max(sourceOverlap, targetOverlap) >
            CONFIG.thresholds.wordOverlap
          ) {
            const sourceScore = similarity(
              entry.source.toLowerCase(),
              searchPhrase
            );
            const targetScore = similarity(
              entry.target.toLowerCase(),
              searchPhrase
            );
            score = Math.max(sourceScore, targetScore);
            // Weight the score using configured weights
            const overlapWeight = Math.max(sourceOverlap, targetOverlap);
            score =
              score * CONFIG.thresholds.weights.fuzzyMatchWeight +
              overlapWeight * CONFIG.thresholds.weights.wordOverlapWeight;
          }
          // Check for exact substring matches
          const isExactMatch = entry.source.toLowerCase() === searchPhrase;
          const isPartialMatch =
            entry.source.toLowerCase().includes(searchPhrase) ||
            entry.target.toLowerCase().includes(searchPhrase);
          if (isExactMatch) {
            score = CONFIG.thresholds.scores.exactMatch;
          } else if (isPartialMatch) {
            // Stricter scoring for partial matches
            const matchRatio = searchPhrase.length / entry.source.length;
            score = Math.max(
              score,
              Math.min(
                CONFIG.thresholds.scores.singularPluralContext,
                CONFIG.thresholds.scores.partialMatchBase + matchRatio * 0.25
              )
            );
          }
          // Length difference penalty
          const lengthDiff = Math.abs(sourceWords.length - searchWords.length);
          if (lengthDiff > 0) {
            score *= Math.max(
              CONFIG.thresholds.multipliers.minLengthPenaltyScore,
              1 - lengthDiff * CONFIG.thresholds.multipliers.lengthDiffPenalty
            );
          }
        }
      } else {
        // For longer queries, use fuzzy match with context
        const sourceScore = similarity(entry.source.toLowerCase(), query);
        const targetScore = similarity(entry.target.toLowerCase(), query);
        const noteScore = entry.note
          ? similarity(entry.note.toLowerCase(), query)
          : 0;
-        // Use the highest score
+        // Update best score if this phrase matched better
-        score = Math.max(sourceScore, targetScore, noteScore);
+        if (score > bestScore) {
          bestScore = score;
          bestPhrase = searchPhrase;
        }
      }
-      // Score is good enough
+      // Apply thresholds
      let threshold =
        CONFIG.thresholds.fuzzy *
        CONFIG.thresholds.multipliers.baseThresholdIncrease;
      if (isAutoSearch) {
        threshold *= CONFIG.thresholds.multipliers.autoSearchThreshold;
      }
      // Higher threshold for single-word matches in multi-word entries
      if (
-        (query.length <= 3 && score > 0) ||
+        bestPhrase.split(/\s+/).length === 1 &&
-        (query.length > 3 && score >= CONFIG.fuzzyThreshold)
+        entry.source.split(/\s+/).length > 1
      ) {
        threshold *= CONFIG.thresholds.multipliers.singleWordThreshold;
      }
      if (bestScore >= threshold) {
        seenEntries.add(entryKey);
        matches.push({
-          entry: entry,
+          entry,
-          score: score,
+          score: bestScore,
          matchedWord: bestPhrase || query,
        });
      }
    });
-    // Sort matches by score (highest first) and text length (longer matches first)
+    // Helper function to calculate overlap score with position matching
-    matches.sort(function (a, b) {
+    function calculateOverlapScore(searchWords, targetWords) {
-      if (b.score === a.score) {
+      let matchCount = 0;
-        return b.entry.source.length - a.entry.source.length;
+      let positionScore = 0;
      for (let i = 0; i < searchWords.length; i++) {
        const searchWord = searchWords[i];
        const targetIndex = targetWords.indexOf(searchWord);
        if (targetIndex !== -1) {
          matchCount++;
          const positionPenalty =
            Math.abs(i - targetIndex) /
            Math.max(searchWords.length, targetWords.length);
          positionScore +=
            1 - positionPenalty * CONFIG.thresholds.multipliers.positionPenalty;
        }
      }
      const matchRatio = matchCount / searchWords.length;
      const avgPositionScore = matchCount > 0 ? positionScore / matchCount : 0;
      return (
        matchRatio * CONFIG.thresholds.weights.positionOverlapWeight +
        avgPositionScore * CONFIG.thresholds.weights.positionMatchWeight
      );
    }
    // Clear caches if they get too large
    if (similarityCache.size > CONFIG.thresholds.cacheLimits.similarity)
      similarityCache.clear();
    if (combinationsCache.size > CONFIG.thresholds.cacheLimits.combinations)
      combinationsCache.clear();
    // Find all punctuation marks in the search phrase
    const punctuationMarks = query.match(/[.,!?;:'")\]}/\\]/g) || [];
    if (punctuationMarks.length > 0) {
      // Add each punctuation mark as a separate search phrase
      punctuationMarks.forEach((mark) => {
        searchPhrases.push(mark);
      });
      // Find exact matches for punctuation marks
      const exactMatches = translationData
        .filter((entry) =>
          punctuationMarks.some(
            (mark) => entry.source.includes(mark) || entry.target.includes(mark)
          )
        )
        .map((entry) => ({
          entry,
          score: 1.0,
          matchedWord: query,
        }));
      matches.push(...exactMatches);
    }
    // Sort matches
    matches.sort((a, b) => {
      // First prioritize exact matches
      if (a.score === 1 && b.score !== 1) return -1;
      if (b.score === 1 && a.score !== 1) return 1;
      // Then by match word count (prefer more complete matches)
      const aWords = a.matchedWord.split(/\s+/).length;
      const bWords = b.matchedWord.split(/\s+/).length;
      if (aWords !== bWords) return bWords - aWords;
      // Then by category presence
      if (!!a.entry.category !== !!b.entry.category) {
        return a.entry.category ? -1 : 1;
      }
      // Finally by score
      return b.score - a.score;
    });
    // Limit results for performance
-    matches = matches.slice(0, 50);
+    const limitedMatches = matches.slice(0, 50);
-    log("success", "Search found matches", { count: matches.length });
+    log("success", "Search found matches", {
-    displayFuzzyMatches(matches);
+      count: limitedMatches.length,
      isAutoSearch,
      matches: limitedMatches.map((m) => ({
        source: m.entry.source,
        score: Math.round(m.score * 100) + "%",
        matchedWord: m.matchedWord,
      })),
    });
    displayFuzzyMatches(limitedMatches);
  }
  function findMatches(text) {
    searchTranslations(text, true);
  }
  function displayFuzzyMatches(matches) {