Implementing Emoji Search
A good emoji search feels instant and returns relevant results even for approximate queries. Typing "smile" should surface 😊 before it surfaces 🧪. Typing "rkt" should still find 🚀. This guide covers the data, algorithms, and trade-offs involved in building emoji search from scratch.
The Data Foundation: Unicode CLDR
CLDR (Common Locale Data Repository) — โปรเจกต์ Unicode ที่ให้ข้อมูลเฉพาะท้องถิ่น รวมถึงชื่ออิโมจิและคำหลักในการค้นหามากกว่า 100 ภาษา
Unicode Consortium — องค์กรไม่แสวงหากำไรที่พัฒนาและดูแลมาตรฐาน Unicode รวมถึงกระบวนการเพิ่มอิโมจิใหม่
The Unicode Consortium's Common Locale Data Repository (CLDR) provides the best source of emoji search data. It includes:
- Short names: A concise English description (e.g., "grinning face")
- Keywords/annotations: Synonyms and related terms (e.g., "face | grin | happy")
- Localized annotations: The same in 60+ languages
The emoji data from CLDR is available via the @unicode/cldr-annotations package or pre-processed in libraries like emojibase-data.
npm install emojibase-data
import data from 'emojibase-data/en/data.json';
import annotations from 'emojibase-data/en/messages.json';
// data[0] example:
// {
// label: "grinning face",
// tags: ["face", "grin", "happy"],
// emoji: "😀",
// hexcode: "1F600",
// group: 0,
// order: 1,
// version: 1.0
// }
Building an Inverted Index
For fast local search over 3,000+ emoji, build an inverted index at startup time:
/**
 * In-memory inverted index over emoji labels and tags.
 *
 * Each label/tag is split into lowercase ASCII tokens, and every token maps to
 * the set of positions (within `emojis`) of the records that contain it.
 * Queries are matched by token prefix; a multi-token query only returns
 * records that match every token.
 */
class EmojiSearchIndex {
  /**
   * @param {Array<{label?: string, tags?: string[]}>} emojis - Records to
   *   index. The array is kept by reference as `this.emojis`.
   */
  constructor(emojis) {
    this.emojis = emojis;
    this.index = this._buildIndex(emojis);
  }

  /**
   * Builds the token → positions map.
   *
   * @param {Array<{label?: string, tags?: string[]}>} emojis
   * @returns {Map<string, Set<number>>} token → set of indices into `emojis`.
   */
  _buildIndex(emojis) {
    const index = new Map();
    emojis.forEach((emoji, position) => {
      const text = [emoji.label, ...(emoji.tags || [])].join(' ');
      for (const token of this._tokenize(text)) {
        let bucket = index.get(token);
        if (bucket === undefined) {
          bucket = new Set();
          index.set(token, bucket);
        }
        bucket.add(position);
      }
    });
    return index;
  }

  /**
   * Splits text into lowercase ASCII tokens of at least two characters.
   *
   * @param {string} text
   * @returns {string[]}
   */
  _tokenize(text) {
    return text
      // Decompose accented letters and drop the combining marks so that
      // "piñata" indexes (and is found) as "pinata" instead of being split
      // into "pi" / "ata" by the ASCII-only filter below.
      .normalize('NFD')
      .replace(/[\u0300-\u036f]/g, '')
      .toLowerCase()
      .replace(/[^a-z0-9 ]/g, ' ')
      .split(/\s+/)
      .filter((token) => token.length > 1);
  }

  /**
   * Returns the records whose tokens prefix-match every token of `query`.
   *
   * @param {string} query
   * @returns {Array<object>} Matching records in dataset order. A query with
   *   no usable tokens returns `this.emojis` itself.
   */
  search(query) {
    const tokens = this._tokenize(query);
    if (tokens.length === 0) return this.emojis;

    let candidates = null;
    for (const token of tokens) {
      // Prefix matching: collect every record indexed under a key that
      // starts with this token.
      const matches = new Set();
      for (const [key, positions] of this.index) {
        if (key.startsWith(token)) {
          for (const position of positions) matches.add(position);
        }
      }
      candidates = candidates === null
        ? matches
        : new Set([...candidates].filter((position) => matches.has(position)));
      // An empty intersection can never grow again.
      if (candidates.size === 0) return [];
    }

    // Sort by position so the output order is deterministic (dataset order)
    // rather than dependent on Map/Set insertion order.
    return [...candidates]
      .sort((a, b) => a - b)
      .map((position) => this.emojis[position]);
  }
}
// Build the index once, then run a sample query against it.
const index = new EmojiSearchIndex(data);
const rocketMatches = index.search("rocket");
console.log(rocketMatches.map(({ emoji }) => emoji));
// ['🚀', '🛸', '🛩️']
Relevance Ranking
Raw set intersection returns results in arbitrary order. Add scoring to rank by relevance:
/**
 * Scores how well an emoji record matches a query; higher is more relevant.
 *
 * Label match (exact 100 / prefix 50 / substring 20) and tag match
 * (exact 30 / prefix 10) are added together, plus 0.1 per recorded use.
 *
 * @param {{label?: string, tags?: string[], usageCount?: number}} emoji
 * @param {string} query - Raw user query; case and surrounding/repeated
 *   whitespace are ignored.
 * @returns {number}
 */
function scoreEmoji(emoji, query) {
  // Normalize the same way a user would expect: " Red   Heart " must score
  // like "red heart", otherwise padded queries rank every hit at 0.
  const q = query.trim().toLowerCase().replace(/\s+/g, ' ');

  // Frequency bonus (if you track usage)
  const usageBonus = (emoji.usageCount || 0) * 0.1;

  // Every string starts with '', so an empty query would otherwise hand the
  // prefix bonuses to all records; rank those by usage alone.
  if (q === '') return usageBonus;

  const label = (emoji.label || '').toLowerCase();
  const tags = (emoji.tags || []).map((tag) => tag.toLowerCase());

  let score = 0;

  if (label === q) {
    score += 100; // exact label match: highest score
  } else if (label.startsWith(q)) {
    score += 50; // label starts with query
  } else if (label.includes(q)) {
    score += 20; // label contains query
  }

  if (tags.includes(q)) {
    score += 30; // tag exact match
  } else if (tags.some((tag) => tag.startsWith(q))) {
    score += 10; // tag prefix match
  }

  return score + usageBonus;
}
/**
 * Runs `index.search(query)` and orders the hits by descending
 * `scoreEmoji` score.
 *
 * @param {{search: function(string): Array<object>}} index
 * @param {string} query
 * @returns {Array<object>} The matching records, best score first.
 */
function rankedSearch(index, query) {
  const scored = [];
  for (const emoji of index.search(query)) {
    scored.push({ emoji, score: scoreEmoji(emoji, query) });
  }
  scored.sort((left, right) => right.score - left.score);
  return scored.map(({ emoji }) => emoji);
}
Fuzzy Matching
For typo tolerance, add fuzzy matching as a fallback when exact/prefix search returns few results:
Levenshtein Distance
/**
 * Levenshtein edit distance between two strings: the minimum number of
 * single-element insertions, deletions and substitutions turning `a` into `b`.
 * Elements are compared by index (`a[i]`), i.e. per UTF-16 code unit for strings.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function levenshtein(a, b) {
  const rows = a.length;
  const cols = b.length;

  // Only the previous row of the DP table is ever read, so keep two rows.
  let previous = Array.from({ length: cols + 1 }, (_, j) => j);

  for (let i = 1; i <= rows; i += 1) {
    const current = [i];
    for (let j = 1; j <= cols; j += 1) {
      if (a[i - 1] === b[j - 1]) {
        current.push(previous[j - 1]);
      } else {
        const cheapest = Math.min(previous[j], current[j - 1], previous[j - 1]);
        current.push(cheapest + 1);
      }
    }
    previous = current;
  }

  return previous[cols];
}
/**
 * Typo-tolerant search: keeps the records whose label or any tag is within
 * `maxDistance` edits of the query, closest first.
 *
 * @param {Array<{label?: string, tags?: string[]}>} emojis
 * @param {string} query
 * @param {number} [maxDistance=2] - Largest accepted edit distance.
 * @returns {Array<object>} Matching records ordered by ascending distance.
 */
function fuzzyEmojiSearch(emojis, query, maxDistance = 2) {
  const q = query.toLowerCase();
  return emojis
    .map((emoji) => {
      const candidates = [];
      // A record without a label can still match through its tags.
      if (typeof emoji.label === 'string') candidates.push(emoji.label);
      candidates.push(...(emoji.tags || []));

      // The query is lowercased, so the candidates must be too; otherwise
      // every capital letter is counted as an edit.
      // Math.min() of nothing is Infinity, which the filter below rejects.
      const dist = Math.min(
        ...candidates.map((text) => levenshtein(q, text.toLowerCase())),
      );
      return { emoji, dist };
    })
    .filter((entry) => entry.dist <= maxDistance)
    .sort((a, b) => a.dist - b.dist)
    .map((entry) => entry.emoji);
}
Using Fuse.js for Production
import Fuse from 'fuse.js';
// Fuse.js fuzzy index over the emoji records. Both `label` and `tags` are
// searched; label matches carry more weight (0.7) than tag matches (0.3).
const fuse = new Fuse(data, {
keys: [
{ name: 'label', weight: 0.7 },
{ name: 'tags', weight: 0.3 },
],
threshold: 0.4, // 0 = exact, 1 = match anything
// NOTE(review): per the Fuse.js options docs this adds a `score` to each
// result next to `item` — confirm against the installed version.
includeScore: true,
// NOTE(review): presumably ignores matches shorter than 2 characters — confirm.
minMatchCharLength: 2,
// Needed for the operator syntax (' and ^) used in the extended query below.
useExtendedSearch: true,
});
// Basic search
const results = fuse.search('smiling');
// Each result wraps the original record under `item`.
console.log(results.map(r => r.item.emoji)); // ['😊', '😁', '😄', ...]
// Extended search: require "smile" AND start with "happy"
const advanced = fuse.search("'smile ^happy");
Handling Multi-Word Queries
Multi-word queries need special treatment to avoid over-filtering:
/**
 * Searches with a multi-word query without over-filtering.
 *
 * The whole query is tried first (every word must match). When that yields
 * fewer than `minResults` hits, records matching any single word are appended
 * after the full-query hits, ordered by the best rank they achieved in any
 * per-word search.
 *
 * @param {{emojis: Array<object>, search: function(string): Array<object>}} index
 * @param {string} query
 * @param {number} [minResults=5] - Fall back to per-word matching below this
 *   many full-query hits.
 * @returns {Array<object>} Ranked records, each appearing once.
 */
function multiWordSearch(index, query, minResults = 5) {
  const words = query.trim().toLowerCase().split(/\s+/);

  if (words.length === 1) {
    return rankedSearch(index, query);
  }

  // Try full phrase first
  const phraseMatches = rankedSearch(index, query);
  if (phraseMatches.length >= minResults) {
    return phraseMatches;
  }

  // Fall back to OR logic. Full-phrase hits stay on top; the rest keep the
  // relevance order of the per-word searches instead of dataset order.
  const seen = new Set(phraseMatches.map((record) => record.emoji));
  const bestByCharacter = new Map();

  // One-character words are skipped: the index drops tokens that short, so
  // searching for one behaves like an empty query and returns every emoji.
  const usableWords = new Set(words.filter((word) => word.length > 1));

  for (const word of usableWords) {
    rankedSearch(index, word).forEach((record, rank) => {
      if (seen.has(record.emoji)) return;
      const known = bestByCharacter.get(record.emoji);
      if (known === undefined || rank < known.rank) {
        bestByCharacter.set(record.emoji, { record, rank });
      }
    });
  }

  const extras = [...bestByCharacter.values()]
    .sort((a, b) => a.rank - b.rank)
    .map((entry) => entry.record);

  return [...phraseMatches, ...extras];
}
Emoji Aliasing and Synonyms
CLDR tags are good but incomplete. Augment with hand-curated synonyms for common queries:
// Hand-curated slang/synonym table: lowercase query → emoji characters to
// surface for it. Values are emoji characters (not dataset records), listed in
// the order they should be shown.
const ALIASES = {
'lol': ['😂', '🤣'],
'love': ['❤️', '😍', '🥰'],
'cool': ['😎', '🆒'],
'fire': ['🔥'],
'goat': ['🐐', '🏆'], // "Greatest of All Time"
'ship': ['🚢', '❤️'], // shipping (romance)
'pray': ['🙏'],
'muscle': ['💪'],
'money': ['💰', '💵', '🤑'],
'party': ['🎉', '🥳', '🎊'],
};
/**
 * Returns the curated emoji characters registered for a query, if any.
 *
 * @param {string} query - Matched case-insensitively, ignoring surrounding
 *   whitespace.
 * @returns {string[]} A fresh array of emoji characters; empty when the query
 *   has no alias.
 */
function aliasLookup(query) {
  const key = query.trim().toLowerCase();
  // Own-property check: a plain `ALIASES[key]` would return inherited members
  // for queries such as "constructor" or "toString" instead of "no alias".
  if (!Object.prototype.hasOwnProperty.call(ALIASES, key)) {
    return [];
  }
  // Copy so callers cannot mutate the shared table.
  return [...ALIASES[key]];
}
/**
 * Combines curated aliases with ranked index search.
 *
 * Alias emoji come first (they are intentional matches), followed by the
 * ranked search results; each emoji appears once. Emoji characters that are
 * not present in `index.emojis` are dropped.
 *
 * @param {{emojis: Array<{emoji: string}>, search: function(string): Array<object>}} index
 * @param {string} query
 * @returns {Array<object>} Records from `index.emojis`.
 */
function fullSearch(index, query) {
  const aliasHits = aliasLookup(query);
  // Tolerate a lookup that yields something other than a list of characters.
  const aliased = Array.isArray(aliasHits) ? aliasHits : [];
  const searched = rankedSearch(index, query);

  // Resolve characters to records with one pass over the dataset instead of
  // a linear scan per result. The first record wins for a repeated character.
  const byCharacter = new Map();
  for (const record of index.emojis) {
    if (!byCharacter.has(record.emoji)) byCharacter.set(record.emoji, record);
  }

  // Merge: aliases first, then search results, skipping anything already emitted.
  const characters = [...aliased, ...searched.map((record) => record.emoji)];
  const seen = new Set();
  const merged = [];
  for (const character of characters) {
    if (seen.has(character)) continue;
    seen.add(character);
    const record = byCharacter.get(character);
    if (record) merged.push(record);
  }
  return merged;
}
Python Implementation
from collections import defaultdict
import re
class EmojiSearchIndex:
    """Inverted index over emoji labels and tags with prefix matching.

    Each record's label and tags are split into lowercase tokens; every token
    maps to the positions (in ``emojis``) of the records containing it. A query
    matches a record when every query token is a prefix of one of its tokens.
    """

    def __init__(self, emojis: list[dict]) -> None:
        """Index ``emojis``; the list is kept by reference as ``self.emojis``."""
        self.emojis = emojis
        self.index: dict[str, set[int]] = defaultdict(set)
        self._build()

    def _tokenize(self, text: str) -> list[str]:
        """Split ``text`` into lowercase tokens of at least two characters."""
        # Local import so this class does not depend on a new file-level import.
        import unicodedata

        # Decompose accented letters and drop the combining marks so that
        # "piñata" is indexed, and found, as "pinata".
        decomposed = unicodedata.normalize("NFD", text)
        folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
        return [t for t in re.split(r"\W+", folded.lower()) if len(t) > 1]

    def _build(self) -> None:
        """Populate ``self.index`` from the label and tags of every record."""
        for i, emoji in enumerate(self.emojis):
            # ``or`` also covers records that carry an explicit None.
            label = emoji.get("label") or ""
            tags = emoji.get("tags") or []
            text = " ".join([label, *tags])
            for token in self._tokenize(text):
                self.index[token].add(i)

    def search(self, query: str) -> list[dict]:
        """Return the records matching every token of ``query`` by prefix.

        Results are in dataset order. A query with no usable tokens returns
        ``self.emojis`` itself.
        """
        tokens = self._tokenize(query)
        if not tokens:
            return self.emojis

        # Prefix match
        candidates: set[int] | None = None
        for token in tokens:
            matches: set[int] = set()
            for key, indices in self.index.items():
                if key.startswith(token):
                    matches |= indices
            candidates = matches if candidates is None else candidates & matches
            # An empty intersection can never grow again.
            if not candidates:
                return []

        return [self.emojis[i] for i in sorted(candidates or ())]
Explore More on EmojiFYI
- Search emoji on EmojiFYI: Search
- View usage statistics and popular emoji: Stats
- Analyze emoji sequences in detail: Sequence Analyzer
- Access the full emoji dataset: API Reference