Implementing Emoji Search
Emoji: a Japanese word (絵文字) meaning 'picture character' — small graphical symbols used in digital communication to express ideas, emotions, and objects.
A good emoji search feels instant and returns relevant results even for approximate queries. Typing "smile" should surface 😊 before it surfaces 🧪. Typing "rkt" should still find 🚀. This guide covers the data, algorithms, and trade-offs involved in building emoji search from scratch.
The Data Foundation: Unicode CLDR
Unicode: Universal character encoding standard that assigns a unique number to every character across all writing systems and symbol sets, including emoji.
CLDR: The Common Locale Data Repository, a Unicode project providing locale-specific data including emoji names and search keywords in 100+ languages.
Unicode Consortium: The non-profit organization that develops and maintains the Unicode Standard, including the process for adding new emoji.
The Unicode Consortium's Common Locale Data Repository (CLDR) provides the best source of emoji search data. It includes:
- Short names: A concise English description (e.g., "grinning face")
- Keywords/annotations: Synonyms and related terms (e.g., "face | grin | happy")
- Localized annotations: The same in 60+ languages
The emoji data from CLDR is available via the @unicode/cldr-annotations package or pre-processed in libraries like emojibase-data.
npm install emojibase-data
import data from 'emojibase-data/en/data.json';
import annotations from 'emojibase-data/en/messages.json';
// data[0] example:
// {
// label: "grinning face",
// tags: ["face", "grin", "happy"],
// emoji: "😀",
// hexcode: "1F600",
// group: 0,
// order: 1,
// version: 1.0
// }
Building an Inverted Index
For fast local search over 3,000+ emoji, build an inverted index at startup time:
/**
 * In-memory inverted index over emoji labels and tags.
 *
 * Each token found in a record's `label` and `tags` maps to the set of
 * positions (in the `emojis` array) of the records that contain it.
 * Queries are prefix-matched per token and combined with AND semantics.
 */
class EmojiSearchIndex {
  /**
   * @param {Array<{label: string, tags?: string[]}>} emojis - Records to index.
   */
  constructor(emojis) {
    this.emojis = emojis;
    this.index = this._buildIndex(emojis);
  }

  /**
   * Build the token → positions map for the given records.
   *
   * @param {Array<{label: string, tags?: string[]}>} emojis
   * @returns {Map<string, Set<number>>}
   */
  _buildIndex(emojis) {
    const index = new Map(); // token → Set of emoji indices
    emojis.forEach((emoji, i) => {
      const tokens = this._tokenize([emoji.label, ...(emoji.tags || [])].join(' '));
      for (const token of tokens) {
        if (!index.has(token)) index.set(token, new Set());
        index.get(token).add(i);
      }
    });
    return index;
  }

  /**
   * Lowercase the text, replace every character other than a-z, 0-9 and
   * space with a space, split on whitespace, and drop tokens shorter than
   * two characters.
   *
   * @param {string} text
   * @returns {string[]}
   */
  _tokenize(text) {
    return text
      .toLowerCase()
      .replace(/[^a-z0-9 ]/g, ' ')
      .split(/\s+/)
      .filter((t) => t.length > 1);
  }

  /**
   * Return the records matching every token of the query by prefix.
   *
   * A query that yields no tokens returns the full `emojis` array. Matches
   * are returned in dataset order (ascending position) so the output is
   * deterministic and does not depend on token insertion order in the index.
   *
   * @param {string} query
   * @returns {Array<object>}
   */
  search(query) {
    const tokens = this._tokenize(query);
    if (tokens.length === 0) return this.emojis;

    let candidates = null;
    for (const token of tokens) {
      // Prefix matching: collect positions under every key starting with the
      // token, keeping only those that survived the previous tokens (AND).
      const matches = new Set();
      for (const [key, indices] of this.index) {
        if (!key.startsWith(token)) continue;
        for (const idx of indices) {
          if (candidates === null || candidates.has(idx)) matches.add(idx);
        }
      }
      candidates = matches;
      // Nothing can match once the intersection is empty.
      if (candidates.size === 0) return [];
    }

    return [...candidates].sort((a, b) => a - b).map((i) => this.emojis[i]);
  }
}
// Build the index once from the emojibase records, then query it.
const index = new EmojiSearchIndex(data);
const rocketHits = index.search("rocket");
console.log(rocketHits.map(({ emoji }) => emoji));
// ['🚀', '🛸', '🛩️']
Relevance Ranking
Raw set intersection returns results in arbitrary order. Add scoring to rank by relevance:
/**
 * Compute a relevance score for one emoji record against a query.
 *
 * The query is trimmed, internal whitespace runs are collapsed to a single
 * space, and everything is compared case-insensitively. Without this, a
 * query such as "rocket " would score 0 against the label "rocket".
 *
 * Label contribution (first that applies): exact 100, prefix 50, substring 20.
 * Tag contribution (first that applies): exact 30, prefix 10.
 * A usage bonus of `usageCount * 0.1` is added when `usageCount` is set.
 *
 * @param {{label: string, tags?: string[], usageCount?: number}} emoji
 * @param {string} query
 * @returns {number} Higher means more relevant.
 */
function scoreEmoji(emoji, query) {
  const q = query.trim().replace(/\s+/g, ' ').toLowerCase();
  const label = emoji.label.toLowerCase();
  const tags = (emoji.tags || []).map((t) => t.toLowerCase());
  let score = 0;

  if (label === q) {
    // Exact label match: highest score
    score += 100;
  } else if (label.startsWith(q)) {
    // Label starts with query
    score += 50;
  } else if (label.includes(q)) {
    // Label contains query
    score += 20;
  }

  if (tags.includes(q)) {
    // Tag exact match
    score += 30;
  } else if (tags.some((t) => t.startsWith(q))) {
    // Tag prefix match
    score += 10;
  }

  // Frequency bonus (if you track usage)
  score += (emoji.usageCount || 0) * 0.1;
  return score;
}
/**
 * Run the index search, then order the hits by descending relevance score.
 *
 * @param {{search: function(string): Array<object>}} index
 * @param {string} query
 * @returns {Array<object>} The matching records, best score first.
 */
function rankedSearch(index, query) {
  const scored = [];
  for (const emoji of index.search(query)) {
    scored.push({ emoji, score: scoreEmoji(emoji, query) });
  }
  scored.sort((left, right) => right.score - left.score);
  return scored.map(({ emoji }) => emoji);
}
Fuzzy Matching
For typo tolerance, add fuzzy matching as a fallback when exact/prefix search returns few results:
Levenshtein Distance
/**
 * Edit distance between two strings: the minimum number of single-element
 * insertions, deletions, and substitutions that turn `a` into `b`.
 * Elements are compared by index, i.e. per UTF-16 code unit for strings.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function levenshtein(a, b) {
  const rows = a.length;
  const cols = b.length;

  // table[r][c] = distance between the first r elements of a and first c of b.
  const table = [];
  for (let r = 0; r <= rows; r += 1) {
    const row = Array.from({ length: cols + 1 }, () => 0);
    row[0] = r;
    table.push(row);
  }
  for (let c = 0; c <= cols; c += 1) {
    table[0][c] = c;
  }

  for (let r = 1; r <= rows; r += 1) {
    for (let c = 1; c <= cols; c += 1) {
      if (a[r - 1] === b[c - 1]) {
        table[r][c] = table[r - 1][c - 1];
      } else {
        const deletion = table[r - 1][c];
        const insertion = table[r][c - 1];
        const substitution = table[r - 1][c - 1];
        table[r][c] = 1 + Math.min(deletion, insertion, substitution);
      }
    }
  }
  return table[rows][cols];
}
/**
 * Typo-tolerant search: keep the records whose label or any tag is within
 * `maxDistance` edits of the query, closest first.
 *
 * Both sides are lowercased before comparison so that case differences do
 * not count as edits.
 *
 * @param {Array<{label: string, tags?: string[]}>} emojis
 * @param {string} query
 * @param {number} [maxDistance=2] - Largest edit distance still accepted.
 * @returns {Array<object>} Matching records ordered by ascending distance.
 */
function fuzzyEmojiSearch(emojis, query, maxDistance = 2) {
  const q = query.toLowerCase();
  return emojis
    .map((emoji) => {
      const texts = [emoji.label, ...(emoji.tags || [])];
      const dist = Math.min(...texts.map((text) => levenshtein(q, text.toLowerCase())));
      return { emoji, dist };
    })
    .filter((r) => r.dist <= maxDistance)
    .sort((a, b) => a.dist - b.dist)
    .map((r) => r.emoji);
}
Using Fuse.js for Production
import Fuse from 'fuse.js';
// Searchable fields and their relative weights.
const searchKeys = [
  { name: 'label', weight: 0.7 },
  { name: 'tags', weight: 0.3 },
];

const fuse = new Fuse(data, {
  keys: searchKeys,
  threshold: 0.4, // 0 = exact, 1 = match anything
  includeScore: true,
  minMatchCharLength: 2,
  useExtendedSearch: true,
});

// Basic search
const results = fuse.search('smiling');
const matchedEmoji = results.map(({ item }) => item.emoji);
console.log(matchedEmoji); // the emoji characters of the matching records

// Extended search: require "smile" AND start with "happy"
const advanced = fuse.search("'smile ^happy");
Handling Multi-Word Queries
Multi-word queries need special treatment to avoid over-filtering:
/**
 * Search with a multi-word query without over-filtering.
 *
 * A single-word query is passed straight to `rankedSearch`. For several
 * words the whole phrase is tried first; if that yields fewer than five
 * results, the per-word results are appended (OR logic).
 *
 * The fallback keeps relevance order: phrase hits first, then each word's
 * ranked hits in query order, de-duplicated by emoji character. Words the
 * index cannot tokenise are skipped, because searching for them alone would
 * return the entire dataset.
 *
 * @param {{emojis: Array<object>, search: function(string): Array<object>}} index
 * @param {string} query
 * @returns {Array<object>}
 */
function multiWordSearch(index, query) {
  const words = query.trim().toLowerCase().split(/\s+/);
  if (words.length === 1) {
    return rankedSearch(index, query);
  }

  // Try full phrase first
  const phraseResults = rankedSearch(index, query);
  if (phraseResults.length >= 5) {
    return phraseResults;
  }

  // Fall back to OR logic if too few results
  const seen = new Set();
  const merged = [];
  const appendUnique = (hits) => {
    for (const hit of hits) {
      if (seen.has(hit.emoji)) continue;
      seen.add(hit.emoji);
      merged.push(hit);
    }
  };

  appendUnique(phraseResults);
  for (const word of words) {
    // A word that yields no tokens would match every emoji; it carries no signal.
    const isUnsearchable =
      typeof index._tokenize === 'function' && index._tokenize(word).length === 0;
    if (isUnsearchable) continue;
    appendUnique(rankedSearch(index, word));
  }
  return merged;
}
Emoji Aliasing and Synonyms
CLDR tags are good but incomplete. Augment with hand-curated synonyms for common queries:
// Hand-curated query → emoji aliases that supplement the CLDR tags.
//
// NOTE(review): these literals were restored from mis-encoded text. The
// entries 😂, 😍, 😎, 🆒, 🐐, 🏆, 🙏, 🎉 and 🎊 had lost all distinguishing
// bytes and were inferred from the alias meaning — confirm against the
// intended table. The remaining entries were recovered from surviving bytes.
const ALIASES = {
  'lol': ['😂', '🤣'],
  'love': ['❤️', '😍', '🥰'],
  'cool': ['😎', '🆒'],
  'fire': ['🔥'],
  'goat': ['🐐', '🏆'], // "Greatest of All Time"
  'ship': ['🚢', '❤️'], // shipping (romance)
  'pray': ['🙏'],
  'muscle': ['💪'],
  'money': ['💰', '💵', '🤑'],
  'party': ['🎉', '🥳', '🎊'],
};
/**
 * Look up the hand-curated aliases for a query, case-insensitively.
 *
 * Only the table's own keys are consulted, so queries such as
 * "constructor" or "__proto__" cannot return members inherited from
 * `Object.prototype`.
 *
 * @param {string} query
 * @returns {string[]} The aliased emoji characters, or an empty array.
 */
function aliasLookup(query) {
  const key = query.toLowerCase();
  if (!Object.prototype.hasOwnProperty.call(ALIASES, key)) {
    return [];
  }
  return ALIASES[key] || [];
}
/**
 * Combine alias hits with ranked search hits.
 *
 * Aliases come first (they are intentional matches), followed by the ranked
 * search results. Each emoji character appears at most once, and alias
 * characters that are not present in the dataset are dropped.
 *
 * @param {{emojis: Array<{emoji: string}>}} index
 * @param {string} query
 * @returns {Array<object>} Records from `index.emojis`.
 */
function fullSearch(index, query) {
  const aliased = aliasLookup(query);
  const searched = rankedSearch(index, query);

  // Resolve characters to records through one lookup table instead of
  // scanning the whole dataset once per result. First occurrence wins.
  const recordByChar = new Map();
  for (const record of index.emojis) {
    if (!recordByChar.has(record.emoji)) recordByChar.set(record.emoji, record);
  }

  // Merge: aliases first, then search results, skipping repeats.
  const orderedChars = [...aliased, ...searched.map((e) => e.emoji)];
  const seen = new Set();
  const merged = [];
  for (const ch of orderedChars) {
    if (seen.has(ch)) continue;
    seen.add(ch);
    const record = recordByChar.get(ch);
    if (record) merged.push(record);
  }
  return merged;
}
Python Implementation
from collections import defaultdict
import re
class EmojiSearchIndex:
    """Inverted index over emoji labels and tags with prefix matching.

    Each token found in a record's ``label`` and ``tags`` maps to the set of
    positions (in ``emojis``) of the records containing it. Query tokens are
    prefix-matched and combined with AND semantics.
    """

    def __init__(self, emojis: list[dict]) -> None:
        """Store the records and build the token index.

        Args:
            emojis: Records to index; ``label`` and ``tags`` are read with
                empty defaults when missing.
        """
        self.emojis = emojis
        self.index: dict[str, set[int]] = defaultdict(set)
        self._build()

    def _tokenize(self, text: str) -> list[str]:
        """Lowercase ``text``, split on non-word runs, drop tokens shorter than 2."""
        return [t for t in re.split(r'\W+', text.lower()) if len(t) > 1]

    def _build(self) -> None:
        """Populate ``self.index`` from the label and tags of every record."""
        for i, emoji in enumerate(self.emojis):
            text = " ".join([emoji.get("label", "")] + emoji.get("tags", []))
            for token in self._tokenize(text):
                self.index[token].add(i)

    def search(self, query: str) -> list[dict]:
        """Return the records matching every query token by prefix.

        A query that yields no tokens returns the full ``emojis`` list.
        Matches are returned in dataset order.
        """
        tokens = self._tokenize(query)
        if not tokens:
            return self.emojis

        # Prefix match
        candidates: set[int] | None = None
        for token in tokens:
            matches: set[int] = set()
            for key, indices in self.index.items():
                if key.startswith(token):
                    matches |= indices
            candidates = matches if candidates is None else candidates & matches

        # Not reachable with a non-empty token list; narrows the Optional type.
        if candidates is None:
            return []
        return [self.emojis[i] for i in sorted(candidates)]
Explore More on EmojiFYI
- Search emoji on EmojiFYI: Search
- View usage statistics and popular emoji: Stats
- Analyze emoji sequences in detail: Sequence Analyzer
- Access the full emoji dataset: API Reference