Unicode has an entire document on this. Emojis and emoji sequences are a lot more complicated than just a few character ranges. There are emoji modifiers (for example, skin tones), regional indicator pairs (country flags), and some special sequences like the pirate flag.
You can use Unicode’s emoji data files to reliably find emoji characters and emoji sequences. This will work even as new complex emojis are added:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Finds emoji characters and emoji sequences in text.
 *
 * <p>The matcher is a single regular expression built from lines in the format
 * of Unicode's {@code emoji-sequences.txt} and {@code emoji-zwj-sequences.txt}
 * data files: a code point field (either a {@code XXXX..YYYY} range or a
 * space-separated list of hexadecimal code points) followed by a semicolon.
 */
public class EmojiCollector {
    /** Emoji version used when the UCD ReadMe does not name an emoji directory. */
    private static final String FALLBACK_EMOJI_VERSION = "15.0";

    /** Matches a code point field of the form {@code XXXX..YYYY}. */
    private static final Pattern CODE_POINT_RANGE =
        Pattern.compile("^(\\p{XDigit}{4,6})\\.\\.(\\p{XDigit}{4,6})");

    /** Separator between code points in a sequence field. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Base URI of the emoji data directory; resolved on first use. */
    private static String emojiSequencesBaseURI;

    /** Alternation of every known emoji range/sequence, longest first. */
    private final Pattern emojiPattern;

    /**
     * Creates a collector from Unicode's published emoji data files, which
     * are downloaded on first use and then read from a per-user cache
     * directory.
     *
     * @throws IOException if the data files cannot be located, downloaded,
     *                     or read
     */
    public EmojiCollector()
    throws IOException {
        this(readDataFiles("emoji-sequences.txt", "emoji-zwj-sequences.txt"));
    }

    /**
     * Creates a collector from lines that are already in memory. Blank
     * lines, lines starting with {@code #}, and lines without a semicolon
     * are ignored.
     *
     * @param dataLines lines in the format of Unicode's emoji sequence
     *                  data files
     */
    public EmojiCollector(Iterable<String> dataLines) {
        emojiPattern = compilePattern(dataLines);
    }

    /** One regex alternative plus the number of code points it consumes. */
    private static final class Alternative {
        final String regex;
        final int codePointCount;

        Alternative(String regex, int codePointCount) {
            this.regex = regex;
            this.codePointCount = codePointCount;
        }
    }

    /** Downloads (or reads from cache) each named data file and returns all their lines. */
    private static List<String> readDataFiles(String... baseNames)
    throws IOException {
        List<String> lines = new ArrayList<>();
        for (String baseName : baseNames) {
            Path file = download(uriOfEmojiSequencesFile(baseName));
            lines.addAll(Files.readAllLines(file, StandardCharsets.UTF_8));
        }
        return lines;
    }

    /** Builds the emoji regex from data-file lines. */
    private static Pattern compilePattern(Iterable<String> dataLines) {
        List<Alternative> alternatives = new ArrayList<>();
        for (String line : dataLines) {
            Alternative alternative = parseDataLine(line);
            if (alternative != null) {
                alternatives.add(alternative);
            }
        }
        if (alternatives.isEmpty()) {
            // An empty regex would match the empty string at every position;
            // an empty negative lookahead never matches anything.
            return Pattern.compile("(?!)");
        }

        // Regex alternation takes the first alternative that matches, so a
        // multi-code-point sequence must be tried before the single code
        // points it is made of; otherwise flags, skin tones and ZWJ sequences
        // are split into their components. The sort is stable, so file order
        // is preserved among alternatives of equal length.
        alternatives.sort(Comparator.comparingInt(
            (Alternative a) -> a.codePointCount).reversed());

        StringBuilder regex = new StringBuilder();
        for (Alternative alternative : alternatives) {
            if (regex.length() > 0) {
                regex.append('|');
            }
            regex.append(alternative.regex);
        }
        return Pattern.compile(regex.toString());
    }

    /** Converts one data-file line to a regex alternative, or null if the line holds no code points. */
    private static Alternative parseDataLine(String line) {
        if (line == null || line.trim().isEmpty() || line.startsWith("#")) {
            return null;
        }
        int semicolon = line.indexOf(';');
        if (semicolon < 0) {
            return null;
        }
        String codepoints = line.substring(0, semicolon).trim();
        if (codepoints.isEmpty()) {
            return null;
        }

        Matcher range = CODE_POINT_RANGE.matcher(codepoints);
        if (range.find()) {
            String regex = "[\\x{" + range.group(1) + "}-\\x{"
                + range.group(2) + "}]";
            // A range matches exactly one code point.
            return new Alternative(regex, 1);
        }

        String[] sequence = WHITESPACE.split(codepoints);
        StringBuilder regex = new StringBuilder();
        for (String codepoint : sequence) {
            regex.append("\\x{").append(codepoint).append("}");
        }
        return new Alternative(regex.toString(), sequence.length);
    }

    /**
     * Returns the URI of an emoji data file. On first use the base directory
     * is looked up in the UCD "latest" ReadMe; if no line there starts with
     * {@code Public/emoji/}, a fixed emoji version is used instead.
     *
     * @param baseName file name within the emoji data directory
     * @throws IOException if the ReadMe cannot be read
     */
    private static String uriOfEmojiSequencesFile(String baseName)
    throws IOException {
        if (emojiSequencesBaseURI == null) {
            String base = null;
            URL readme = new URL(
                "https://www.unicode.org/Public/UCD/latest/ReadMe.txt");
            try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(readme.openStream(),
                    StandardCharsets.UTF_8))) {

                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.startsWith("Public/emoji/")) {
                        base = "https://www.unicode.org/" + line.trim();
                        if (!base.endsWith("/")) {
                            base += "/";
                        }
                        break;
                    }
                }
            }
            if (base == null) {
                base = "https://www.unicode.org/Public/emoji/"
                    + FALLBACK_EMOJI_VERSION + "/";
            }
            emojiSequencesBaseURI = base;
        }
        return emojiSequencesBaseURI + baseName;
    }

    /** Returns the per-user cache directory for the current platform. */
    private static Path cacheDirectory() {
        String os = System.getProperty("os.name");
        String home = System.getProperty("user.home");

        if (os.contains("Windows")) {
            // %APPDATA% is the roaming profile; machine-local data such as a
            // download cache belongs under %LOCALAPPDATA%.
            String localAppData = System.getenv("LOCALAPPDATA");
            if (localAppData != null) {
                return Paths.get(localAppData);
            }
            return Paths.get(home, "AppData", "Local");
        }
        if (os.contains("Mac")) {
            return Paths.get(home, "Library", "Application Support");
        }

        String cacheHome = System.getenv("XDG_CACHE_HOME");
        if (cacheHome != null) {
            Path dir = Paths.get(cacheHome);
            // Relative values of XDG_CACHE_HOME are ignored.
            if (dir.isAbsolute()) {
                return dir;
            }
        }
        return Paths.get(home, ".cache");
    }

    /**
     * Returns a local copy of the file at {@code uri}, downloading it into
     * the cache directory if no readable copy is there yet.
     *
     * @param uri location of the file to fetch
     * @return path of the cached file
     * @throws IOException if the file cannot be downloaded or stored
     */
    private static Path download(String uri)
    throws IOException {
        String baseName = uri.substring(uri.lastIndexOf('/') + 1);
        Path dataDir = cacheDirectory().resolve(EmojiCollector.class.getName());
        Path dataFile = dataDir.resolve(baseName);

        if (!Files.isReadable(dataFile)) {
            Files.createDirectories(dataDir);
            // Download to a temporary file first, so an interrupted transfer
            // never leaves a truncated file that later runs would trust.
            Path partial = Files.createTempFile(dataDir, baseName, ".part");
            try {
                try (InputStream data = new URL(uri).openStream()) {
                    Files.copy(data, partial,
                        StandardCopyOption.REPLACE_EXISTING);
                }
                Files.move(partial, dataFile,
                    StandardCopyOption.REPLACE_EXISTING);
            } finally {
                Files.deleteIfExists(partial);
            }
        }
        return dataFile;
    }

    /**
     * Extracts every emoji and emoji sequence from a string, in order of
     * appearance.
     *
     * @param letters text to search; must not be null
     * @return the matched emoji, each as its own string; empty if none
     */
    public Collection<String> getEmojisIn(String letters) {
        Collection<String> emoticons = new ArrayList<>();
        Matcher emojiMatcher = emojiPattern.matcher(letters);
        while (emojiMatcher.find()) {
            emoticons.add(emojiMatcher.group());
        }
        return emoticons;
    }

    /**
     * Prints, for each command-line argument, the emoji found in it.
     *
     * @param args strings to search for emoji
     * @throws IOException if the emoji data files cannot be obtained
     */
    public static void main(String[] args)
    throws IOException {
        EmojiCollector collector = new EmojiCollector();
        for (String arg : args) {
            Collection<String> emojis = collector.getEmojisIn(arg);
            System.out.println(arg + " => " + String.join("", emojis));
        }
    }
}