#!/usr/bin/env python

import re
import json

# https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
# http://stackoverflow.com/a/13436167/96656
def unisymbol(codePoint):
	if codePoint >= 0x0000 and codePoint <= 0xFFFF:
		return unichr(codePoint)
	elif codePoint >= 0x010000 and codePoint <= 0x10FFFF:
		highSurrogate = int((codePoint - 0x10000) / 0x400) + 0xD800
		lowSurrogate = int((codePoint - 0x10000) % 0x400) + 0xDC00
		return unichr(highSurrogate) + unichr(lowSurrogate)
	else:
		return 'Error'

def hexify(codePoint):
	return 'U+' + hex(codePoint)[2:].upper().zfill(6)

def writeFile(filename, contents):
	print filename
	with open(filename, 'w') as f:
		f.write(contents.strip() + '\n')

data = []
for codePoint in range(0x000000, 0x10FFFF + 1):
	# Skip non-scalar values.
	if codePoint >= 0xD800 and codePoint <= 0xDFFF:
		continue
	symbol = unisymbol(codePoint)
	# http://stackoverflow.com/a/17199950/96656
	bytes = symbol.encode('utf8').decode('latin1')
	data.append({
		'codePoint': codePoint,
		'decoded': symbol,
		'encoded': bytes
	});

jsonData = json.dumps(data, sort_keys=False, indent=2, separators=(',', ': '))
# Use tabs instead of double spaces for indentation
jsonData = jsonData.replace('  ', '\t')
# Escape hexadecimal digits in escape sequences
jsonData = re.sub(
	r'\\u([a-fA-F0-9]{4})',
	lambda match: r'\u{}'.format(match.group(1).upper()),
	jsonData
)

writeFile('data.json', jsonData)