feat: better search

This commit is contained in:
2025-02-22 10:27:34 +07:00
parent 0c7cd5d7e9
commit 31b72412fe

View File

@@ -42,7 +42,56 @@ const CONFIG = {
sourceStringContainer: "#source_phrase_container", sourceStringContainer: "#source_phrase_container",
autoSearchInterval: 1000, autoSearchInterval: 1000,
fuzzyThreshold: 0.7,
// Search thresholds and scoring configuration.
// Central place for the tuning values read by normalizeWord, areWordsSimilar
// and searchTranslations, so none of them hard-code their own numbers.
thresholds: {
// searchTranslations multiplies this by multipliers.baseThresholdIncrease
// (and, where they apply, autoSearchThreshold and singleWordThreshold) to get
// the minimum score an entry needs in order to be listed.
fuzzy: 0.7, // Base fuzzy matching threshold
// A multi-word phrase is only fuzzy-scored against an entry when its best
// overlap score (source or target side) is strictly greater than this value.
wordOverlap: 0.5, // Word overlap threshold for longer phrases
// Word normalization settings
// Read by normalizeWord (stripChars, wordEndings) and by areWordsSimilar
// (maxCharDiff, minWordLength, minVariationSimilarity).
// - maxCharDiff is compared against the difference in LENGTH of the two
//   normalized words, not against an edit distance.
// - If either normalized word is shorter than minWordLength, areWordsSimilar
//   falls back to exact equality instead of fuzzy matching.
// NOTE(review): normalizeWord stops at the first ending that matches and the
// one-letter plural ending is listed before the two-letter one, so the
// two-letter plural ending can never be the one removed. stripChars also
// removes the apostrophe before endings are checked, so the possessive entry
// at the end of the list cannot match either. Confirm the order is intended.
normalization: {
stripChars: /[.,!?;:'")\]}/\\]/g, // Remove these characters when normalizing words
maxCharDiff: 2, // Maximum allowed character difference for similar words
minWordLength: 4, // Minimum word length to apply fuzzy matching
minVariationSimilarity: 0.75, // Minimum similarity for word variations
wordEndings: ["s", "es", "ed", "ing", "'s"], // Common word endings to normalize
},
// Base scores
// Fixed scores that searchTranslations assigns to particular kinds of match.
scores: {
// Multi-word phrase identical to the lowercased entry source.
exactMatch: 1.0,
// Single-word phrase found in source or target by the word-boundary regex.
exactWordMatch: 0.9,
// NOTE(review): not read by any code visible in this change - confirm it is still needed.
contextBaseScore: 0.6,
// NOTE(review): not read by any code visible in this change - confirm it is still needed.
singularPluralMatch: 0.95,
// Used as the upper cap for the partial (substring) match score.
singularPluralContext: 0.85,
// Starting value of the partial match score; a share of the phrase/source
// length ratio is added on top before the cap is applied.
partialMatchBase: 0.6,
// Score given when areWordsSimilar reports a variation match; a higher
// word-level similarity can raise it further.
wordVariationMatch: 0.85,
},
// Multipliers and penalties
multipliers: {
// Applied to the threshold for auto-search only; below 1, so it lowers the bar.
autoSearchThreshold: 0.95,
// Applied to the threshold when the best phrase is a single word but the
// entry source has several words; above 1, so it raises the bar.
singleWordThreshold: 1.4,
// Always applied to thresholds.fuzzy.
baseThresholdIncrease: 1.1,
// Scales the relative position offset of a matched word in calculateOverlapScore.
positionPenalty: 1.5,
// Score reduction per word of difference between phrase and source word counts.
lengthDiffPenalty: 0.2,
// Floor for the length-difference factor, so the penalty never drops below it.
minLengthPenaltyScore: 0.3,
},
// Weights for different scoring components
weights: {
// fuzzyMatchWeight and wordOverlapWeight blend the string similarity with
// the overlap score for multi-word phrases.
fuzzyMatchWeight: 0.2,
wordOverlapWeight: 0.8,
// In calculateOverlapScore, positionMatchWeight multiplies the average
// position score and positionOverlapWeight multiplies the ratio of matched
// words.
positionMatchWeight: 0.4,
positionOverlapWeight: 0.6,
},
// Cache limits
// searchTranslations empties the corresponding cache completely once its
// size exceeds the limit.
cacheLimits: {
similarity: 10000,
combinations: 1000,
},
},
metadata: { metadata: {
version: "1.1.4", version: "1.1.4",
@@ -152,6 +201,60 @@ function similarity(s1, s2) {
return (longerLength - levenshteinDistance(s1, s2)) / longerLength; return (longerLength - levenshteinDistance(s1, s2)) / longerLength;
} }
/**
 * Reduce a word to a crude stem so that simple variations compare equal.
 *
 * The word is lowercased, every character matched by
 * CONFIG.thresholds.normalization.stripChars is removed, and then at most one
 * suffix from CONFIG.thresholds.normalization.wordEndings is dropped. Endings
 * are tried in list order and the first applicable one wins.
 *
 * An ending is only dropped when a non-empty stem remains. Without that guard
 * a token that is itself an ending ("s", "ed", "ing") collapsed to "", which
 * made unrelated tokens normalize to the same empty string.
 *
 * @param {string} word - Raw word from a search phrase or a translation entry.
 * @returns {string} The normalized word. It is "" only when the input
 *   contained nothing but stripped characters.
 */
function normalizeWord(word) {
  const { stripChars, wordEndings } = CONFIG.thresholds.normalization;
  const cleaned = word.toLowerCase().replace(stripChars, "");

  // First ending in list order that matches AND leaves something behind.
  const ending = wordEndings.find(
    (candidate) =>
      cleaned.length > candidate.length && cleaned.endsWith(candidate)
  );
  if (ending === undefined) {
    return cleaned;
  }
  return cleaned.slice(0, cleaned.length - ending.length);
}
// Cache for word combinations and similarity scores.
// Declared at module level so helpers defined outside TranslatorTool (such as
// getCachedSimilarity) can reach them.
// - combinationsCache: looked up by getCachedCombinations using the input text as key.
// - similarityCache: filled by getCachedSimilarity, keyed by a string built
//   from the two compared strings.
// Neither cache evicts single entries; searchTranslations clears a cache
// completely once its size exceeds the limit in CONFIG.thresholds.cacheLimits.
const combinationsCache = new Map();
const similarityCache = new Map();
/**
 * Memoizing wrapper around similarity().
 *
 * Results are stored in the module-level similarityCache so that repeated
 * comparisons of the same pair of strings are computed only once.
 *
 * The cache key carries the length of the first string as a prefix. A plain
 * "first|second" key is ambiguous when either string contains "|": the pairs
 * ("a|b", "c") and ("a", "b|c") would share one entry and the second lookup
 * would return the score of the first pair.
 *
 * @param {string} str1 - First string to compare.
 * @param {string} str2 - Second string to compare.
 * @returns {number} The value returned by similarity(str1, str2).
 */
function getCachedSimilarity(str1, str2) {
  const first = `${str1}`;
  // The length prefix pins down where the first string ends inside the key.
  const key = `${first.length}|${first}|${str2}`;
  if (similarityCache.has(key)) {
    return similarityCache.get(key);
  }
  const score = similarity(str1, str2);
  similarityCache.set(key, score);
  return score;
}
/**
 * Decide whether two words count as variations of the same word.
 *
 * Both words are passed through normalizeWord first. The decision then uses
 * the settings in CONFIG.thresholds.normalization:
 *  1. if either normalized word is shorter than minWordLength, only exact
 *     equality of the normalized forms counts;
 *  2. otherwise, a length difference above maxCharDiff rules the pair out;
 *  3. otherwise, the pair matches when its cached similarity score reaches
 *     minVariationSimilarity.
 *
 * @param {string} word1 - First word.
 * @param {string} word2 - Second word.
 * @returns {boolean} True when the words are considered similar.
 */
function areWordsSimilar(word1, word2) {
  const { minWordLength, maxCharDiff, minVariationSimilarity } =
    CONFIG.thresholds.normalization;
  const first = normalizeWord(word1);
  const second = normalizeWord(word2);

  // Short stems are too noisy for fuzzy comparison: demand identity.
  const shortestLength = Math.min(first.length, second.length);
  if (shortestLength < minWordLength) {
    return first === second;
  }

  // Skip the similarity computation when the lengths alone are too far apart.
  const lengthGap = Math.abs(first.length - second.length);
  if (lengthGap > maxCharDiff) {
    return false;
  }

  const score = getCachedSimilarity(first, second);
  return score >= minVariationSimilarity;
}
function TranslatorTool() { function TranslatorTool() {
var container; var container;
var translationData = []; var translationData = [];
@@ -865,7 +968,12 @@ function TranslatorTool() {
log("info", "Setting up event listeners"); log("info", "Setting up event listeners");
// Debounce the search with 300ms delay // Debounce the search with 300ms delay
const debouncedSearch = debounce(() => { const debouncedSearch = debounce(() => {
searchTranslations(); if (!searchInput.value.trim()) {
// If textbox is cleared, force a search of the editor content
checkForEditorContent(true);
} else {
searchTranslations(searchInput.value, false);
}
}, 300); }, 300);
searchInput.addEventListener("input", function () { searchInput.addEventListener("input", function () {
@@ -987,6 +1095,7 @@ function TranslatorTool() {
terms: content.terms, terms: content.terms,
stringId: content.stringId, stringId: content.stringId,
length: content.fullText.length, length: content.fullText.length,
lastSearchedText: lastSearchedText,
}); });
findMatches(content.fullText); findMatches(content.fullText);
} }
@@ -1298,10 +1407,6 @@ function TranslatorTool() {
}; };
} }
// Cache for word combinations
const combinationsCache = new Map();
const similarityCache = new Map();
function getCachedCombinations(text) { function getCachedCombinations(text) {
if (combinationsCache.has(text)) { if (combinationsCache.has(text)) {
return combinationsCache.get(text); return combinationsCache.get(text);
@@ -1343,237 +1448,310 @@ function TranslatorTool() {
return combinations; return combinations;
} }
function getCachedSimilarity(str1, str2) { function searchTranslations(text, isAutoSearch = false) {
const key = `${str1}|${str2}`; if (!text || !translationData.length) {
if (similarityCache.has(key)) { updateResults("");
return similarityCache.get(key); lastSearchedText = "";
return;
} }
const score = similarity(str1, str2);
similarityCache.set(key, score);
return score;
}
function findMatches(text) { // For manual search
if (!text || !translationData.length) return; let searchText = text;
if (!isAutoSearch) {
log("debug", "Finding matches for text:", { const editorTextbox = document.querySelector(CONFIG.textboxSelector);
text: text, if (editorTextbox && editorTextbox.value.trim()) {
wordCount: text.split(/\s+/).filter((w) => w.length > 0).length, searchText = editorTextbox.value;
});
const matches = [];
const seenCombinations = new Set();
const combinations = getCachedCombinations(text);
log("debug", "Generated combinations:", combinations);
// Pre-calculate source combinations for each entry
const entryCombinations = new Map();
translationData.forEach((entry) => {
entryCombinations.set(entry, getCachedCombinations(entry.source));
});
combinations.forEach(function (combination) {
if (!combination) return;
const combinationLower = combination.toLowerCase();
// Early exit if we already have enough high-quality matches
if (matches.length > 20 && matches[19].score > 0.9) {
return;
} }
translationData.forEach(function (entry) {
const uniqueKey = `${entry.source.toLowerCase()}_${
entry.category || "default"
}`;
if (seenCombinations.has(uniqueKey)) return;
const entryLower = entry.source.toLowerCase();
// For exact matches (case-insensitive)
if (entryLower === combinationLower) {
seenCombinations.add(uniqueKey);
matches.push({
entry: entry,
score: 1,
matchedWord: combination,
});
return;
}
// Only proceed if the source is significant
if (!isSignificantPhrase(entry.source)) {
return;
}
// Get cached source combinations
const sourceCombinations = entryCombinations.get(entry);
// Find best matching combination
let bestScore = 0;
let bestMatch = "";
let bestSourceCombo = "";
for (const sourceCombo of sourceCombinations) {
const score = getCachedSimilarity(
sourceCombo.toLowerCase(),
combinationLower
);
// Early exit if score is too low
if (score < 0.8) continue;
const sourceWordCount = sourceCombo.split(/\s+/).length;
const combinationWordCount = combination.split(/\s+/).length;
let adjustedScore = score;
// Heavy penalties for mismatches
if (Math.abs(sourceWordCount - combinationWordCount) > 0) {
adjustedScore *= 0.4;
}
if (combinationWordCount === 1 && sourceWordCount > 1) {
adjustedScore *= 0.3;
}
// Exact word boundary match bonus
const isExactMatch = new RegExp(`\\b${combinationLower}\\b`).test(
sourceCombo.toLowerCase()
);
if (isExactMatch) {
adjustedScore *= 1.3;
}
if (adjustedScore > bestScore) {
bestScore = adjustedScore;
bestMatch = combination;
bestSourceCombo = sourceCombo;
}
}
// Stricter thresholds
let threshold = CONFIG.fuzzyThreshold * 1.2;
if (combination.split(/\s+/).length === 1) {
threshold *= 1.4;
}
if (bestScore >= threshold && !seenCombinations.has(uniqueKey)) {
seenCombinations.add(uniqueKey);
matches.push({
entry: entry,
score: bestScore,
matchedWord: bestMatch,
});
}
});
});
// Clear caches if they get too large
if (similarityCache.size > 10000) {
similarityCache.clear();
}
if (combinationsCache.size > 1000) {
combinationsCache.clear();
} }
// Sort matches by score first, then by category const query = searchText.toLowerCase().trim();
matches.sort(function (a, b) { if (!isAutoSearch && query.length <= 1) {
const aWordCount = a.matchedWord.split(/\s+/).length;
const bWordCount = b.matchedWord.split(/\s+/).length;
if (Math.abs(b.score - a.score) < 0.05) {
if (aWordCount !== bWordCount) {
return bWordCount - aWordCount;
}
if (!!a.entry.category !== !!b.entry.category) {
return a.entry.category ? -1 : 1;
}
return b.matchedWord.length - a.matchedWord.length;
}
return b.score - a.score;
});
log(
"info",
"Final matches:",
matches.map((match) => ({
source: match.entry.source,
matchedWord: match.matchedWord,
score: Math.round(match.score * 100) + "%",
category: match.entry.category || "none",
}))
);
displayFuzzyMatches(matches);
}
function searchTranslations() {
var query = searchInput.value.toLowerCase().trim();
if (!query || query.length <= 1) {
updateResults(""); updateResults("");
lastSearchedText = ""; lastSearchedText = "";
checkForEditorContent(true); checkForEditorContent(true);
return; return;
} }
log("info", "Searching translations for", { query: query }); log(
var matches = []; "info",
`${isAutoSearch ? "Auto" : "Manual"} searching translations for`,
{
query: query,
originalText: text,
editorText: !isAutoSearch ? searchText : undefined,
isAutoSearch: isAutoSearch,
}
);
// Find matches const matches = [];
translationData.forEach(function (entry) { const seenEntries = new Set();
let score = 0;
// For short queries (2-3 chars), use stricter matching // For auto-search or long queries, break down into significant phrases
if (query.length <= 3) { const searchPhrases = [];
// Only match if it's a complete word match or surrounded by word boundaries if (isAutoSearch || query.split(/\s+/).length > 3) {
const regex = new RegExp(`\\b${query}\\b`, "i"); // Get word combinations for better partial matching
if ( searchPhrases.push(...getCachedCombinations(query));
regex.test(entry.source) || } else {
regex.test(entry.target) || searchPhrases.push(query);
(entry.note && regex.test(entry.note)) }
) {
score = 1; // Remove duplicates and empty phrases
const uniquePhrases = [...new Set(searchPhrases)].filter(
(phrase) => phrase && phrase.length > 2
);
log("debug", "Searching with phrases:", uniquePhrases);
translationData.forEach((entry) => {
const entryKey = `${entry.source}_${entry.category || ""}`;
if (seenEntries.has(entryKey)) return;
let bestScore = 0;
let bestPhrase = "";
// Try each search phrase against the entry
for (const searchPhrase of uniquePhrases) {
let score = 0;
const searchWords = searchPhrase.split(/\s+/);
// For single words or short phrases, use enhanced matching
if (searchWords.length === 1 || searchPhrase.length <= 3) {
const sourceWords = entry.source.toLowerCase().split(/\s+/);
const targetWords = entry.target.toLowerCase().split(/\s+/);
// Check for word variations and similarities
const hasVariationMatch = searchWords.some(
(searchWord) =>
sourceWords.some((sourceWord) =>
areWordsSimilar(searchWord, sourceWord)
) ||
targetWords.some((targetWord) =>
areWordsSimilar(searchWord, targetWord)
)
);
if (hasVariationMatch) {
score = CONFIG.thresholds.scores.wordVariationMatch;
// Boost score for closer matches
const bestSourceMatch = Math.max(
...sourceWords.map((w) =>
Math.max(
...searchWords.map((sw) =>
getCachedSimilarity(normalizeWord(w), normalizeWord(sw))
)
)
)
);
const bestTargetMatch = Math.max(
...targetWords.map((w) =>
Math.max(
...searchWords.map((sw) =>
getCachedSimilarity(normalizeWord(w), normalizeWord(sw))
)
)
)
);
const bestMatch = Math.max(bestSourceMatch, bestTargetMatch);
score = Math.max(score, bestMatch);
}
// Exact matching
const regex = new RegExp(`\\b${searchPhrase}\\b`, "i");
if (regex.test(entry.source) || regex.test(entry.target)) {
score = Math.max(score, CONFIG.thresholds.scores.exactWordMatch);
}
} else {
// For longer phrases, use stricter matching
const sourceWords = entry.source.toLowerCase().split(/\s+/);
const targetWords = entry.target.toLowerCase().split(/\s+/);
// Calculate word overlap with stricter position consideration
const sourceOverlap = calculateOverlapScore(searchWords, sourceWords);
const targetOverlap = calculateOverlapScore(searchWords, targetWords);
// Only use fuzzy matching if there's significant word overlap
if (
Math.max(sourceOverlap, targetOverlap) >
CONFIG.thresholds.wordOverlap
) {
const sourceScore = similarity(
entry.source.toLowerCase(),
searchPhrase
);
const targetScore = similarity(
entry.target.toLowerCase(),
searchPhrase
);
score = Math.max(sourceScore, targetScore);
// Weight the score using configured weights
const overlapWeight = Math.max(sourceOverlap, targetOverlap);
score =
score * CONFIG.thresholds.weights.fuzzyMatchWeight +
overlapWeight * CONFIG.thresholds.weights.wordOverlapWeight;
}
// Check for exact substring matches
const isExactMatch = entry.source.toLowerCase() === searchPhrase;
const isPartialMatch =
entry.source.toLowerCase().includes(searchPhrase) ||
entry.target.toLowerCase().includes(searchPhrase);
if (isExactMatch) {
score = CONFIG.thresholds.scores.exactMatch;
} else if (isPartialMatch) {
// Stricter scoring for partial matches
const matchRatio = searchPhrase.length / entry.source.length;
score = Math.max(
score,
Math.min(
CONFIG.thresholds.scores.singularPluralContext,
CONFIG.thresholds.scores.partialMatchBase + matchRatio * 0.25
)
);
}
// Length difference penalty
const lengthDiff = Math.abs(sourceWords.length - searchWords.length);
if (lengthDiff > 0) {
score *= Math.max(
CONFIG.thresholds.multipliers.minLengthPenaltyScore,
1 - lengthDiff * CONFIG.thresholds.multipliers.lengthDiffPenalty
);
}
} }
} else {
// For longer queries, use fuzzy match with context
const sourceScore = similarity(entry.source.toLowerCase(), query);
const targetScore = similarity(entry.target.toLowerCase(), query);
const noteScore = entry.note
? similarity(entry.note.toLowerCase(), query)
: 0;
// Use the highest score // Update best score if this phrase matched better
score = Math.max(sourceScore, targetScore, noteScore); if (score > bestScore) {
bestScore = score;
bestPhrase = searchPhrase;
}
} }
// Score is good enough // Apply thresholds
let threshold =
CONFIG.thresholds.fuzzy *
CONFIG.thresholds.multipliers.baseThresholdIncrease;
if (isAutoSearch) {
threshold *= CONFIG.thresholds.multipliers.autoSearchThreshold;
}
// Higher threshold for single-word matches in multi-word entries
if ( if (
(query.length <= 3 && score > 0) || bestPhrase.split(/\s+/).length === 1 &&
(query.length > 3 && score >= CONFIG.fuzzyThreshold) entry.source.split(/\s+/).length > 1
) { ) {
threshold *= CONFIG.thresholds.multipliers.singleWordThreshold;
}
if (bestScore >= threshold) {
seenEntries.add(entryKey);
matches.push({ matches.push({
entry: entry, entry,
score: score, score: bestScore,
matchedWord: bestPhrase || query,
}); });
} }
}); });
// Sort matches by score (highest first) and text length (longer matches first) // Helper function to calculate overlap score with position matching
matches.sort(function (a, b) { function calculateOverlapScore(searchWords, targetWords) {
if (b.score === a.score) { let matchCount = 0;
return b.entry.source.length - a.entry.source.length; let positionScore = 0;
for (let i = 0; i < searchWords.length; i++) {
const searchWord = searchWords[i];
const targetIndex = targetWords.indexOf(searchWord);
if (targetIndex !== -1) {
matchCount++;
const positionPenalty =
Math.abs(i - targetIndex) /
Math.max(searchWords.length, targetWords.length);
positionScore +=
1 - positionPenalty * CONFIG.thresholds.multipliers.positionPenalty;
}
} }
const matchRatio = matchCount / searchWords.length;
const avgPositionScore = matchCount > 0 ? positionScore / matchCount : 0;
return (
matchRatio * CONFIG.thresholds.weights.positionOverlapWeight +
avgPositionScore * CONFIG.thresholds.weights.positionMatchWeight
);
}
// Clear caches if they get too large
if (similarityCache.size > CONFIG.thresholds.cacheLimits.similarity)
similarityCache.clear();
if (combinationsCache.size > CONFIG.thresholds.cacheLimits.combinations)
combinationsCache.clear();
// Find all punctuation marks in the search phrase
const punctuationMarks = query.match(/[.,!?;:'")\]}/\\]/g) || [];
if (punctuationMarks.length > 0) {
// Add each punctuation mark as a separate search phrase
punctuationMarks.forEach((mark) => {
searchPhrases.push(mark);
});
// Find exact matches for punctuation marks
const exactMatches = translationData
.filter((entry) =>
punctuationMarks.some(
(mark) => entry.source.includes(mark) || entry.target.includes(mark)
)
)
.map((entry) => ({
entry,
score: 1.0,
matchedWord: query,
}));
matches.push(...exactMatches);
}
// Sort matches
matches.sort((a, b) => {
// First prioritize exact matches
if (a.score === 1 && b.score !== 1) return -1;
if (b.score === 1 && a.score !== 1) return 1;
// Then by match word count (prefer more complete matches)
const aWords = a.matchedWord.split(/\s+/).length;
const bWords = b.matchedWord.split(/\s+/).length;
if (aWords !== bWords) return bWords - aWords;
// Then by category presence
if (!!a.entry.category !== !!b.entry.category) {
return a.entry.category ? -1 : 1;
}
// Finally by score
return b.score - a.score; return b.score - a.score;
}); });
// Limit results for performance // Limit results for performance
matches = matches.slice(0, 50); const limitedMatches = matches.slice(0, 50);
log("success", "Search found matches", { count: matches.length }); log("success", "Search found matches", {
displayFuzzyMatches(matches); count: limitedMatches.length,
isAutoSearch,
matches: limitedMatches.map((m) => ({
source: m.entry.source,
score: Math.round(m.score * 100) + "%",
matchedWord: m.matchedWord,
})),
});
displayFuzzyMatches(limitedMatches);
}
function findMatches(text) {
searchTranslations(text, true);
} }
function displayFuzzyMatches(matches) { function displayFuzzyMatches(matches) {