parent
02d8cf6e9a
commit
0613232202
7 changed files with 32823 additions and 2591 deletions
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,113 +0,0 @@ |
||||
#!/usr/bin/perl -w |
||||
# |
||||
# update_emoji.pl |
||||
# |
||||
# This script generates the emoji.plugin.zsh emoji definitions from the Unicode |
||||
# character data for the emoji characters. |
||||
# |
||||
# The data file can be found at https://unicode.org/Public/emoji/latest/emoji-data.txt |
||||
# as referenced in Unicode TR51 (https://www.unicode.org/reports/tr51/index.html). |
||||
# |
||||
# This is known to work with the data file from version 1.0. It may not work with later |
||||
# versions if the format changes. In particular, this reads line comments to get the |
||||
# emoji character name and unicode version. |
||||
# |
||||
# Country names have punctuation and other non-letter characters removed from their name, |
||||
# to avoid possible complications with having to escape the strings when using them as |
||||
# array subscripts. The definition file seems to use some combining characters like accents |
||||
# that get stripped during this process. |
||||
|
||||
use strict; |
||||
use warnings; |
||||
use 5.010; |
||||
use autodie; |
||||
|
||||
use Path::Class; |
||||
use File::Copy; |
||||
|
||||
# Parse definitions out of the data file and convert |
||||
sub process_emoji_data_file { |
||||
my ( $infile, $outfilename ) = @_; |
||||
my $file = file($infile); |
||||
my $outfile = file($outfilename); |
||||
my $outfilebase = $outfile->basename(); |
||||
my $tempfilename = "$outfilename.tmp"; |
||||
my $tempfile = file($tempfilename); |
||||
my $outfh = $tempfile->openw(); |
||||
$outfh->print(" |
||||
# $outfilebase - Emoji character definitions for oh-my-zsh emoji plugin |
||||
# |
||||
# This file is auto-generated by update_emoji.pl. Do not edit it manually. |
||||
# |
||||
# This contains the definition for: |
||||
# \$emoji - which maps character names to Unicode characters |
||||
# \$emoji_flags - maps country names to Unicode flag characters using region indicators |
||||
|
||||
# Main emoji |
||||
typeset -gAH emoji |
||||
# National flags |
||||
typeset -gAH emoji_flags |
||||
# Combining modifiers |
||||
typeset -gAH emoji_mod |
||||
|
||||
"); |
||||
|
||||
my $fh = $file->openr(); |
||||
my $line_num = 0; |
||||
while ( my $line = $fh->getline() ) { |
||||
$line_num++; |
||||
$_ = $line; |
||||
# Skip all-comment lines (from the header) and blank lines |
||||
# (But don't strip comments on normal lines; we need to parse those for |
||||
# the emoji names.) |
||||
next if /^\s*#/ or /^\s*$/; |
||||
|
||||
if (/^(\S.*?\S)\s*;\s*(\w+)\s*;\s*(\w+)\s*;\s*(\w+)\s*;\s*(\w.*?)\s*#\s*V(\S+)\s\(.*?\)\s*(\w.*\S)\s*$/) { |
||||
my ($code, $style, $level, $modifier_status, $sources, $version, $keycap_name) |
||||
= ($1, $2, $3, $4, $5, $6, $7); |
||||
#print "code=$code style=$style level=$level modifier_status=$modifier_status sources=$sources version=$version name=$keycap_name\n"; |
||||
my @code_points = split /\s+/, $code; |
||||
my @sources = split /\s+/, $sources; |
||||
|
||||
my $flag_country = ""; |
||||
if ( $keycap_name =~ /^flag for (\S.*?)\s*$/) { |
||||
$flag_country = $1; |
||||
} |
||||
|
||||
my $zsh_code = join '', map { "\\U$_" } @code_points; |
||||
# Convert keycap names to valid associative array names that do not require any |
||||
# quoting. Works fine for most stuff, but is clumsy for flags. |
||||
my $omz_name = lc($keycap_name); |
||||
$omz_name =~ s/[^A-Za-z0-9]/_/g; |
||||
my $zsh_flag_country = $flag_country; |
||||
$zsh_flag_country =~ s/[^\p{Letter}]/_/g; |
||||
if ($flag_country) { |
||||
$outfh->print("emoji_flags[$zsh_flag_country]=\$'$zsh_code'\n"); |
||||
} else { |
||||
$outfh->print("emoji[$omz_name]=\$'$zsh_code'\n"); |
||||
} |
||||
# Modifiers are included in both the main set and their separate map, |
||||
# because they have a standalone representation as a color swatch. |
||||
if ( $modifier_status eq "modifier" ) { |
||||
$outfh->print("emoji_mod[$omz_name]=\$'$zsh_code'\n"); |
||||
} |
||||
} else { |
||||
die "Failed parsing line $line_num: '$_'"; |
||||
} |
||||
} |
||||
$fh->close(); |
||||
$outfh->print("\n"); |
||||
$outfh->close(); |
||||
|
||||
move($tempfilename, $outfilename) |
||||
or die "Failed moving temp file to $outfilename: $!"; |
||||
} |
||||
|
||||
my $datafile = "emoji-data.txt"; |
||||
my $zsh_def_file = "emoji-char-definitions.zsh"; |
||||
process_emoji_data_file($datafile, $zsh_def_file); |
||||
|
||||
print "Updated definition file $zsh_def_file\n"; |
||||
|
||||
|
||||
|
||||
@ -0,0 +1,213 @@ |
||||
""" |
||||
Update Emoji.py |
||||
Refeshes OMZ emoji database based on the latest Unicode spec |
||||
""" |
||||
import re |
||||
import json |
||||
|
||||
spec = open("emoji-data.txt", "r") |
||||
|
||||
# Regexes |
||||
# regex_emoji will return, respectively: |
||||
# the code points, its type (status), the actual emoji, and its official name |
||||
regex_emoji = r"^([\w ].*?\S)\s*;\s*([\w-]+)\s*#\s*(.*?)\s(\S.*).*$" |
||||
# regex_group returns the group of subgroup that a line opens |
||||
regex_group = r"^#\s*(group|subgroup):\s*(.*)$" |
||||
|
||||
headers = """ |
||||
# emoji-char-definitions.zsh - Emoji definitions for oh-my-zsh emoji plugin |
||||
# |
||||
# This file is auto-generated by update_emoji.py. Do not edit it manually. |
||||
# |
||||
# This contains the definition for: |
||||
# $emoji - which maps character names to Unicode characters |
||||
# $emoji_flags - maps country names to Unicode flag characters using region |
||||
# indicators |
||||
# $emoji_mod - maps modifier components to Unicode characters |
||||
# $emoji_groups - a single associative array to avoid cluttering up the |
||||
# global namespace, and to allow adding additional group |
||||
# definitions at run time. The keys are the group names, and |
||||
# the values are whitespace-separated lists of emoji |
||||
# character names. |
||||
|
||||
# Main emoji |
||||
typeset -gAH emoji |
||||
# National flags |
||||
typeset -gAH emoji_flags |
||||
# Combining modifiers |
||||
typeset -gAH emoji_mod |
||||
# Emoji groups |
||||
typeset -gAH emoji_groups |
||||
""" |
||||
|
||||
####### |
||||
# Adding country codes |
||||
####### |
||||
# This is the only part of this script that relies on an external library |
||||
# (country_converter), and is hence commented out by default. |
||||
# You can uncomment it to have country codes added as aliases for flag |
||||
# emojis. (By default, when you install this extension, country codes are |
||||
# included as aliases, but not if you re-run this script without uncommenting.) |
||||
# Warning: country_converter is very verbose, and will print warnings all over |
||||
# your terminal. |
||||
|
||||
# import country_converter as coco # pylint: disable=wrong-import-position |
||||
# cc = coco.CountryConverter() |
||||
|
||||
# def country_iso(_all_names, _omz_name): |
||||
# """ Using the external library country_converter, |
||||
# this funciton can detect the ISO2 and ISO3 codes |
||||
# of the country. It takes as argument the array |
||||
# with all the names of the emoji, and returns that array.""" |
||||
# omz_no_underscore = re.sub(r'_', r' ', _omz_name) |
||||
# iso2 = cc.convert(names=[omz_no_underscore], to='ISO2') |
||||
# if iso2 != 'not found': |
||||
# _all_names.append(iso2) |
||||
# iso3 = cc.convert(names=[omz_no_underscore], to='ISO3') |
||||
# _all_names.append(iso3) |
||||
# return _all_names |
||||
|
||||
|
||||
####### |
||||
# Helper functions |
||||
####### |
||||
|
||||
def code_to_omz(_code_points): |
||||
""" Returns a ZSH-compatible Unicode string from the code point(s) """ |
||||
return r'\U' + r'\U'.join(_code_points.split(' ')) |
||||
|
||||
def name_to_omz(_name, _group, _subgroup, _status): |
||||
""" Returns a reasonable snake_case name for the emoji. """ |
||||
def snake_case(_string): |
||||
""" Does the regex work of snake_case """ |
||||
remove_dots = re.sub(r'\.\(\)', r'', _string) |
||||
replace_ands = re.sub(r'\&', r'and', remove_dots) |
||||
remove_whitespace = re.sub(r'[^\#\*\w]', r'_', replace_ands) |
||||
return re.sub(r'__', r'_', remove_whitespace) |
||||
|
||||
shortname = "" |
||||
split_at_colon = lambda s: s.split(": ") |
||||
# Special treatment by group and subgroup |
||||
# If the emoji is a flag, we strip "flag" from its name |
||||
if _group == "Flags" and len(split_at_colon(_name)) > 1: |
||||
shortname = snake_case(split_at_colon(_name)[1]) |
||||
else: |
||||
shortname = snake_case(_name) |
||||
# Special treatment by status |
||||
# Enables us to have every emoji combination, |
||||
# even the one that are not officially sanctionned |
||||
# and are implemeted by, say, only one vendor |
||||
if _status == "unqualified": |
||||
shortname += "_unqualified" |
||||
elif _status == "minimally-qualified": |
||||
shortname += "_minimally" |
||||
return shortname |
||||
|
||||
def increment_name(_shortname): |
||||
""" Increment the short name by 1. If you get, say, |
||||
'woman_detective_unqualified', it returns |
||||
'woman_detective_unqualified_1', and then |
||||
'woman_detective_unqualified_2', etc. """ |
||||
last_char = _shortname[-1] |
||||
if last_char.isdigit(): |
||||
num = int(last_char) |
||||
return _shortname[:-1] + str(num + 1) |
||||
return _shortname + "_1" |
||||
|
||||
######## |
||||
# Going through every line |
||||
######## |
||||
|
||||
group, subgroup, short_name_buffer = "", "", "" |
||||
emoji_database = [] |
||||
for line in spec: |
||||
# First, test if this line opens a group or subgroup |
||||
group_match = re.findall(regex_group, line) |
||||
if group_match != []: |
||||
gr_or_sub, name = group_match[0] |
||||
if gr_or_sub == "group": |
||||
group = name |
||||
elif gr_or_sub == "subgroup": |
||||
subgroup = name |
||||
continue # Moving on... |
||||
# Second, test if this line references one emoji |
||||
emoji_match = re.findall(regex_emoji, line) |
||||
if emoji_match != []: |
||||
code_points, status, emoji, name = emoji_match[0] |
||||
omz_codes = code_to_omz(code_points) |
||||
omz_name = name_to_omz(name, group, subgroup, status) |
||||
# If this emoji has the same shortname as the preceding one |
||||
if omz_name in short_name_buffer: |
||||
omz_name = increment_name(short_name_buffer) |
||||
short_name_buffer = omz_name |
||||
emoji_database.append( |
||||
[omz_codes, status, emoji, omz_name, group, subgroup]) |
||||
spec.close() |
||||
|
||||
######## |
||||
# Write to emoji-char-definitions.zsh |
||||
######## |
||||
|
||||
# Aliases for emojis are retrieved through the DB of Gemoji |
||||
# Retrieved on Aug 9 2019 from the following URL: |
||||
# https://raw.githubusercontent.com/github/gemoji/master/db/emoji.json |
||||
|
||||
gemoji_db = open("gemoji_db.json") |
||||
j = json.load(gemoji_db) |
||||
aliases_map = {entry['emoji']: entry['aliases'] for entry in j} |
||||
all_omz_names = [emoji_data[3] for emoji_data in emoji_database] |
||||
|
||||
# Let's begin writing to this file |
||||
output = open("emoji-char-definitions.zsh", "w") |
||||
output.write(headers) |
||||
|
||||
emoji_groups = {"fruits": "\n", "vehicles": "\n", "hands": "\n", |
||||
"people": "\n", "animals": "\n", "faces": "\n", |
||||
"flags": "\n"} |
||||
|
||||
# First, write every emoji down |
||||
for _omz_codes, _status, _emoji, _omz_name, _group, _subgroup in emoji_database: |
||||
|
||||
# One emoji can be mapped to multiple names (aliases or country codes) |
||||
names_for_this_emoji = [_omz_name] |
||||
|
||||
# Variable that indicates in which map the emoji will be located |
||||
emoji_map = "emoji" |
||||
if _status == "component": |
||||
emoji_map = "emoji_mod" |
||||
if _group == "Flags": |
||||
emoji_map = "emoji_flags" |
||||
# Adding country codes (Optional, see above) |
||||
# names_for_this_emoji = country_iso(names_for_this_emoji, _omz_name) |
||||
|
||||
# Check if there is an alias available in the Gemoji DB |
||||
if _emoji in aliases_map.keys(): |
||||
for alias in aliases_map[_emoji]: |
||||
if alias not in all_omz_names: |
||||
names_for_this_emoji.append(alias) |
||||
|
||||
# And now we write to the definitions file |
||||
for one_name in names_for_this_emoji: |
||||
output.write(f"{emoji_map}[{one_name}]=$'{_omz_codes}'\n") |
||||
|
||||
# Storing the emoji in defined subgroups for the next step |
||||
if _status == "fully-qualified": |
||||
if _subgroup == "food-fruit": |
||||
emoji_groups["fruits"] += f" {_omz_name}\n" |
||||
elif "transport-" in _subgroup: |
||||
emoji_groups["vehicles"] += f" {_omz_name}\n" |
||||
elif "hand-" in _subgroup: |
||||
emoji_groups["hands"] += f" {_omz_name}\n" |
||||
elif "person-" in _subgroup or _subgroup == "family": |
||||
emoji_groups["people"] += f" {_omz_name}\n" |
||||
elif "animal-" in _subgroup: |
||||
emoji_groups["animals"] += f" {_omz_name}\n" |
||||
elif "face-" in _subgroup: |
||||
emoji_groups["faces"] += f" {_omz_name}\n" |
||||
elif _group == "Flags": |
||||
emoji_groups["flags"] += f" {_omz_name}\n" |
||||
|
||||
# Second, write the subgroups to the end of the file |
||||
for name, string in emoji_groups.items(): |
||||
output.write(f'\nemoji_groups[{name}]="{string}"\n') |
||||
output.close() |
||||
Loading…
Reference in new issue