keyd/scripts/generate_xcompose

#!/usr/bin/env python3

import sys

# Collect every non-ASCII codepoint listed in the Unicode character database.
# Only the first ';'-separated field of each line (the hexadecimal codepoint)
# is used.
# Original source: https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt
codes = []
with open('data/unicode.txt', encoding='utf-8') as unicode_data:
    for line in unicode_data:
        try:
            code = int(line.split(';')[0], 16)

            # Ensure the character is encodable (surrogates are not)
            chr(code).encode('utf8')
        except (ValueError, OverflowError):
            # ValueError covers a non-hex first field, a value outside the
            # range chr() accepts, and UnicodeEncodeError (a ValueError
            # subclass) for surrogates. OverflowError covers absurdly large
            # values. Such lines are skipped, as before, but unrelated
            # errors (e.g. KeyboardInterrupt) are no longer swallowed.
            continue

        # ASCII characters are excluded from the table.
        if code >= 128:
            codes.append(code)

# We use the base36 encoded index as the compose sequence to minimize
# the total number of keysyms required.
def base36(n):
    """Return ``n`` as a fixed-width, three-character base36 string.

    Digits are drawn from ``0-9a-z`` and the result is zero-padded on the
    left, so ``0`` maps to ``'000'`` and ``36**3 - 1`` maps to ``'zzz'``.

    Args:
        n: Integer index in the range ``0 <= n < 36**3`` (46656).

    Returns:
        A three-character string, most significant digit first.

    Raises:
        ValueError: If ``n`` does not fit in three base36 digits. Without
            this check the value would silently wrap around and collide
            with the sequence generated for a different index.
    """
    chars = '0123456789abcdefghijklmnopqrstuvwxyz'
    base = len(chars)
    width = 3

    if not 0 <= n < base ** width:
        raise ValueError(
            f'index {n} cannot be represented with {width} base{base} digits'
        )

    # Peel off the least significant digit first, then reverse.
    digits = []
    for _ in range(width):
        n, remainder = divmod(n, base)
        digits.append(chars[remainder])

    return ''.join(reversed(digits))

# Generate the compose file

# One line per codepoint, of the form:
#
#   <Cancel> <d> <d> <d> : "X"
#
# where the three <d> keysyms are the base36 digits of the codepoint's index
# in `codes` and X is the character itself.
data = ''.join(
    '<Cancel> '
    + ' '.join(f'<{c}>' for c in base36(n))
    + f' : "{chr(code)}"\n'
    for n, code in enumerate(codes)
)

# Every emitted character is non-ASCII, so pin the encoding instead of
# relying on the locale default, and close the file deterministically.
with open('data/keyd.compose', 'w', encoding='utf-8') as compose_file:
    compose_file.write(data)

# Generate the corresponding src/unicode.c

# OPT: We could condense this and shave off lookup time by using an offset
# table to capitalize on codepoint contiguity, but 35k is small enough to
# warrant keeping the entire thing in memory.

# The template is written with one extra leading tab per line so it reads
# well here; replace() drops that tab from every line and lstrip() removes
# the newline that follows the opening quotes.
unicode_c_source = f'''
	/* GENERATED BY {sys.argv[0]}, DO NOT MODIFY BY HAND. */

	#include <stdint.h>
	#include <stdlib.h>

	uint32_t unicode_table[] = {{ {','.join(map(str, codes))} }};

	int lookup_xcompose_index(uint32_t codepoint) {{
		size_t i = 0;

		for(i = 0; i < sizeof(unicode_table)/sizeof(unicode_table[0]); i++) {{
			if (unicode_table[i] == codepoint)
				return i;
		}}

		return -1;
	}}
'''.replace('\n\t', '\n').lstrip()

# Use a context manager so the generated file is flushed and closed
# deterministically rather than whenever the file object is collected.
with open('src/unicode.c', 'w', encoding='utf-8') as unicode_c_file:
    unicode_c_file.write(unicode_c_source)