| 
									
										
										
										
											2021-08-22 08:17:58 -07:00
										 |  |  | #!/usr/bin/env python3 | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  | # This script is based on | 
					
						
							|  |  |  | # https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py | 
					
						
							|  |  |  | # distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  | # This script uses the following Unicode tables: | 
					
						
							|  |  |  | # - UnicodeData.txt | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from collections import namedtuple | 
					
						
							|  |  |  | import csv | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import subprocess | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | NUM_CODEPOINTS=0x110000 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def to_ranges(iter): | 
					
						
							|  |  |  |     current = None | 
					
						
							|  |  |  |     for i in iter: | 
					
						
							|  |  |  |         if current is None or i != current[1] or i in (0x10000, 0x20000): | 
					
						
							|  |  |  |             if current is not None: | 
					
						
							|  |  |  |                 yield tuple(current) | 
					
						
							|  |  |  |             current = [i, i + 1] | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             current[1] += 1 | 
					
						
							|  |  |  |     if current is not None: | 
					
						
							|  |  |  |         yield tuple(current) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_escaped(codepoints): | 
					
						
							|  |  |  |     for c in codepoints: | 
					
						
							|  |  |  |         if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '): | 
					
						
							|  |  |  |             yield c.value | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_file(f): | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         return open(os.path.basename(f)) | 
					
						
							|  |  |  |     except FileNotFoundError: | 
					
						
							|  |  |  |         subprocess.run(["curl", "-O", f], check=True) | 
					
						
							|  |  |  |         return open(os.path.basename(f)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Codepoint = namedtuple('Codepoint', 'value class_') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_codepoints(f): | 
					
						
							|  |  |  |     r = csv.reader(f, delimiter=";") | 
					
						
							|  |  |  |     prev_codepoint = 0 | 
					
						
							|  |  |  |     class_first = None | 
					
						
							|  |  |  |     for row in r: | 
					
						
							|  |  |  |         codepoint = int(row[0], 16) | 
					
						
							|  |  |  |         name = row[1] | 
					
						
							|  |  |  |         class_ = row[2] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if class_first is not None: | 
					
						
							|  |  |  |             if not name.endswith("Last>"): | 
					
						
							|  |  |  |                 raise ValueError("Missing Last after First") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for c in range(prev_codepoint + 1, codepoint): | 
					
						
							|  |  |  |             yield Codepoint(c, class_first) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         class_first = None | 
					
						
							|  |  |  |         if name.endswith("First>"): | 
					
						
							|  |  |  |             class_first = class_ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         yield Codepoint(codepoint, class_) | 
					
						
							|  |  |  |         prev_codepoint = codepoint | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if class_first is not None: | 
					
						
							|  |  |  |         raise ValueError("Missing Last after First") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for c in range(prev_codepoint + 1, NUM_CODEPOINTS): | 
					
						
							|  |  |  |         yield Codepoint(c, None) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def compress_singletons(singletons): | 
					
						
							|  |  |  |     uppers = [] # (upper, # items in lowers) | 
					
						
							|  |  |  |     lowers = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for i in singletons: | 
					
						
							|  |  |  |         upper = i >> 8 | 
					
						
							|  |  |  |         lower = i & 0xff | 
					
						
							|  |  |  |         if len(uppers) == 0 or uppers[-1][0] != upper: | 
					
						
							|  |  |  |             uppers.append((upper, 1)) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             upper, count = uppers[-1] | 
					
						
							|  |  |  |             uppers[-1] = upper, count + 1 | 
					
						
							|  |  |  |         lowers.append(lower) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return uppers, lowers | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def compress_normal(normal): | 
					
						
							|  |  |  |     # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f | 
					
						
							|  |  |  |     # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff | 
					
						
							|  |  |  |     compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     prev_start = 0 | 
					
						
							|  |  |  |     for start, count in normal: | 
					
						
							|  |  |  |         truelen = start - prev_start | 
					
						
							|  |  |  |         falselen = count | 
					
						
							|  |  |  |         prev_start = start + count | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         assert truelen < 0x8000 and falselen < 0x8000 | 
					
						
							|  |  |  |         entry = [] | 
					
						
							|  |  |  |         if truelen > 0x7f: | 
					
						
							|  |  |  |             entry.append(0x80 | (truelen >> 8)) | 
					
						
							|  |  |  |             entry.append(truelen & 0xff) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             entry.append(truelen & 0x7f) | 
					
						
							|  |  |  |         if falselen > 0x7f: | 
					
						
							|  |  |  |             entry.append(0x80 | (falselen >> 8)) | 
					
						
							|  |  |  |             entry.append(falselen & 0xff) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             entry.append(falselen & 0x7f) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         compressed.append(entry) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return compressed | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def print_singletons(uppers, lowers, uppersname, lowersname): | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |     print("  static constexpr singleton {}[] = {{".format(uppersname)) | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  |     for u, c in uppers: | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |         print("    {{{:#04x}, {}}},".format(u, c)) | 
					
						
							|  |  |  |     print("  };") | 
					
						
							|  |  |  |     print("  static constexpr unsigned char {}[] = {{".format(lowersname)) | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  |     for i in range(0, len(lowers), 8): | 
					
						
							|  |  |  |         print("    {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8]))) | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |     print("  };") | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | def print_normal(normal, normalname): | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |     print("  static constexpr unsigned char {}[] = {{".format(normalname)) | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  |     for v in normal: | 
					
						
							|  |  |  |         print("    {}".format(" ".join("{:#04x},".format(i) for i in v))) | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |     print("  };") | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | def main(): | 
					
						
							|  |  |  |     file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     codepoints = get_codepoints(file) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     CUTOFF=0x10000 | 
					
						
							|  |  |  |     singletons0 = [] | 
					
						
							|  |  |  |     singletons1 = [] | 
					
						
							|  |  |  |     normal0 = [] | 
					
						
							|  |  |  |     normal1 = [] | 
					
						
							|  |  |  |     extra = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for a, b in to_ranges(get_escaped(codepoints)): | 
					
						
							|  |  |  |         if a > 2 * CUTOFF: | 
					
						
							|  |  |  |             extra.append((a, b - a)) | 
					
						
							|  |  |  |         elif a == b - 1: | 
					
						
							|  |  |  |             if a & CUTOFF: | 
					
						
							|  |  |  |                 singletons1.append(a & ~CUTOFF) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 singletons0.append(a) | 
					
						
							|  |  |  |         elif a == b - 2: | 
					
						
							|  |  |  |             if a & CUTOFF: | 
					
						
							|  |  |  |                 singletons1.append(a & ~CUTOFF) | 
					
						
							|  |  |  |                 singletons1.append((a + 1) & ~CUTOFF) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 singletons0.append(a) | 
					
						
							|  |  |  |                 singletons0.append(a + 1) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             if a >= 2 * CUTOFF: | 
					
						
							|  |  |  |                 extra.append((a, b - a)) | 
					
						
							|  |  |  |             elif a & CUTOFF: | 
					
						
							|  |  |  |                 normal1.append((a & ~CUTOFF, b - a)) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 normal0.append((a, b - a)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     singletons0u, singletons0l = compress_singletons(singletons0) | 
					
						
							|  |  |  |     singletons1u, singletons1l = compress_singletons(singletons1) | 
					
						
							|  |  |  |     normal0 = compress_normal(normal0) | 
					
						
							|  |  |  |     normal1 = compress_normal(normal1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     print("""\
 | 
					
						
							| 
									
										
										
										
											2022-01-30 10:55:28 -06:00
										 |  |  | FMT_FUNC auto is_printable(uint32_t cp) -> bool {\ | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  | """)
 | 
					
						
							| 
									
										
										
										
											2021-08-22 15:14:13 -07:00
										 |  |  |     print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower') | 
					
						
							|  |  |  |     print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower') | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |     print_normal(normal0, 'normal0') | 
					
						
							|  |  |  |     print_normal(normal1, 'normal1') | 
					
						
							|  |  |  |     print("""\
 | 
					
						
							|  |  |  |   auto lower = static_cast<uint16_t>(cp); | 
					
						
							|  |  |  |   if (cp < 0x10000) { | 
					
						
							| 
									
										
										
										
											2021-08-22 15:14:13 -07:00
										 |  |  |     return is_printable(lower, singletons0, | 
					
						
							|  |  |  |                         sizeof(singletons0) / sizeof(*singletons0), | 
					
						
							| 
									
										
										
										
											2021-08-22 13:23:38 -07:00
										 |  |  |                         singletons0_lower, normal0, sizeof(normal0)); | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |   } | 
					
						
							|  |  |  |   if (cp < 0x20000) { | 
					
						
							| 
									
										
										
										
											2021-08-22 15:14:13 -07:00
										 |  |  |     return is_printable(lower, singletons1, | 
					
						
							|  |  |  |                         sizeof(singletons1) / sizeof(*singletons1), | 
					
						
							| 
									
										
										
										
											2021-08-22 13:23:38 -07:00
										 |  |  |                         singletons1_lower, normal1, sizeof(normal1)); | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |   }\ | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  | """)
 | 
					
						
							|  |  |  |     for a, b in extra: | 
					
						
							| 
									
										
										
										
											2021-08-22 09:10:10 -07:00
										 |  |  |         print("  if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b)) | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  |     print("""\
 | 
					
						
							| 
									
										
										
										
											2021-08-22 15:14:13 -07:00
										 |  |  |   return cp < 0x{:x}; | 
					
						
							|  |  |  | }}\ | 
					
						
							|  |  |  | """.format(NUM_CODEPOINTS))
 | 
					
						
							| 
									
										
										
										
											2021-08-22 08:15:34 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | if __name__ == '__main__': | 
					
						
							|  |  |  |     main() |