#!/usr/bin/env python3

import argparse, collections, datetime, os, re, sys, unicodedata
from urllib.request import urlopen

# Use intranges.intranges_from_list() from the sibling idna directory
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "idna"))
from intranges import intranges_from_list

# Refuse to run on Python 2: print a message and exit with status 2.
# NOTE(review): the `from urllib.request import urlopen` import above already
# fails on Python 2, so this guard is presumably never reached there -- confirm
# whether it is still needed.
if sys.version_info[0] < 3:
    print("Only Python 3 supported.")
    sys.exit(2)

# Unicode version used when --version is left at its default of "preferred".
PREFERRED_VERSION = "16.0.0"
# URL templates for the data files; UnicodeData._ucdfile() fills in
# {version} and {filename}.
UCD_URL = "http://www.unicode.org/Public/{version}/ucd/{filename}"
UTS46_URL = "http://www.unicode.org/Public/idna/{version}/{filename}"

# Cache directory used when neither --cache nor --no-cache is given; the "~"
# is expanded when the cache file path is built.
DEFAULT_CACHE_DIR = "~/.cache/unidata"

# Scripts affected by IDNA contextual rules
SCRIPT_WHITELIST = sorted(["Greek", "Han", "Hebrew", "Hiragana", "Katakana"])

# Used to piece apart UTS#46 data for Jython compatibility
UTS46_SEGMENT_SIZE = 100

# Maps a status keyword from the UTS #46 mapping table to a tuple of
# (one-letter code written to the generated data, whether the entry carries a
# mapping string that must be emitted alongside it).
UTS46_STATUSES = {
    "valid": ("V", False),
    "ignored": ("I", False),
    "mapped": ("M", True),
    "deviation": ("D", True),
    "disallowed": ("X", False),
}

# Exceptions are manually assigned in Section 2.6 of RFC 5892.
# Keys are code points, values are the IDNA 2008 status forced for them;
# CodePoint.idna2008_status consults this table before any derived rule.
exceptions = {
    0x00DF: "PVALID",  # LATIN SMALL LETTER SHARP S
    0x03C2: "PVALID",  # GREEK SMALL LETTER FINAL SIGMA
    0x06FD: "PVALID",  # ARABIC SIGN SINDHI AMPERSAND
    0x06FE: "PVALID",  # ARABIC SIGN SINDHI POSTPOSITION MEN
    0x0F0B: "PVALID",  # TIBETAN MARK INTERSYLLABIC TSHEG
    0x3007: "PVALID",  # IDEOGRAPHIC NUMBER ZERO
    0x00B7: "CONTEXTO",  # MIDDLE DOT
    0x0375: "CONTEXTO",  # GREEK LOWER NUMERAL SIGN (KERAIA)
    0x05F3: "CONTEXTO",  # HEBREW PUNCTUATION GERESH
    0x05F4: "CONTEXTO",  # HEBREW PUNCTUATION GERSHAYIM
    0x30FB: "CONTEXTO",  # KATAKANA MIDDLE DOT
    0x0660: "CONTEXTO",  # ARABIC-INDIC DIGIT ZERO
    0x0661: "CONTEXTO",  # ARABIC-INDIC DIGIT ONE
    0x0662: "CONTEXTO",  # ARABIC-INDIC DIGIT TWO
    0x0663: "CONTEXTO",  # ARABIC-INDIC DIGIT THREE
    0x0664: "CONTEXTO",  # ARABIC-INDIC DIGIT FOUR
    0x0665: "CONTEXTO",  # ARABIC-INDIC DIGIT FIVE
    0x0666: "CONTEXTO",  # ARABIC-INDIC DIGIT SIX
    0x0667: "CONTEXTO",  # ARABIC-INDIC DIGIT SEVEN
    0x0668: "CONTEXTO",  # ARABIC-INDIC DIGIT EIGHT
    0x0669: "CONTEXTO",  # ARABIC-INDIC DIGIT NINE
    0x06F0: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT ZERO
    0x06F1: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT ONE
    0x06F2: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT TWO
    0x06F3: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT THREE
    0x06F4: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT FOUR
    0x06F5: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT FIVE
    0x06F6: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT SIX
    0x06F7: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT SEVEN
    0x06F8: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT EIGHT
    0x06F9: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT NINE
    0x0640: "DISALLOWED",  # ARABIC TATWEEL
    0x07FA: "DISALLOWED",  # NKO LAJANYALAN
    0x302E: "DISALLOWED",  # HANGUL SINGLE DOT TONE MARK
    0x302F: "DISALLOWED",  # HANGUL DOUBLE DOT TONE MARK
    0x3031: "DISALLOWED",  # VERTICAL KANA REPEAT MARK
    0x3032: "DISALLOWED",  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK
    0x3033: "DISALLOWED",  # VERTICAL KANA REPEAT MARK UPPER HALF
    0x3034: "DISALLOWED",  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF
    0x3035: "DISALLOWED",  # VERTICAL KANA REPEAT MARK LOWER HALF
    0x303B: "DISALLOWED",  # VERTICAL IDEOGRAPHIC ITERATION MARK
}
# Same shape as `exceptions` (code point -> forced status), consulted right
# after it by CodePoint.idna2008_status; currently empty.
backwardscompatible = {}


def hexrange(start, end):
    """Return the inclusive range of integers between two hexadecimal strings.

    Both *start* and *end* are parsed as base-16 numbers; *end* itself is part
    of the returned ``range``.
    """
    first = int(start, 16)
    last = int(end, 16)
    return range(first, last + 1)


def hexvalue(value):
    """Parse *value*, a string of hexadecimal digits, into an integer."""
    parsed = int(value, 16)
    return parsed


# A literal backslash, the letter "u", then exactly four hexadecimal digits.
_RE_UNICODE = re.compile("\\\\u([0-9a-fA-F]{4})")
# A high surrogate immediately followed by a low surrogate.
_RE_SURROGATE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def unicode_fixup(string):
    """Expand backslash-u-XXXX escapes in *string* into real characters.

    The escapes are decoded first; any high/low surrogate pair that results is
    then merged into the single supplementary-plane character it encodes.
    """

    def _decode_escape(match):
        # Group 1 holds the four hex digits of the escape.
        return chr(int(match.group(1), 16))

    def _merge_pair(match):
        high, low = match.group(0)
        return chr((ord(high) - 0xD800) * 0x400 + ord(low) - 0xDC00 + 0x10000)

    unescaped = _RE_UNICODE.sub(_decode_escape, string)
    return _RE_SURROGATE.sub(_merge_pair, unescaped)


class UnicodeVersion(object):
    """A Unicode version: either a concrete ``"x.y.z"`` or the moving ``"latest"``.

    Concrete versions expose ``major``, ``minor``, ``patch`` and ``numerical``;
    those attributes are not set on a ``"latest"`` instance. ``latest`` is a
    boolean flag telling the two kinds apart.

    Ordering rules:
      * concrete versions compare component-wise as (major, minor, patch);
      * "latest" is greater than everything (including another "latest");
      * "latest" is never equal to anything, not even to itself, because it
        does not denote a fixed version.

    Instances are unhashable because ``__eq__`` is defined without ``__hash__``.
    """

    def __init__(self, version):
        """Parse *version*; raise ValueError unless it is "x.y.z" or "latest"."""
        result = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$", version)
        if result:
            self.major = int(result.group("major"))
            self.minor = int(result.group("minor"))
            self.patch = int(result.group("patch"))
            # Kept for backwards compatibility only. This packing collides as
            # soon as minor or patch reaches 16, so comparisons below use the
            # (major, minor, patch) tuple instead.
            self.numerical = (self.major << 8) + (self.minor << 4) + self.patch
            self.latest = False
        elif version == "latest":
            self.latest = True
        else:
            raise ValueError("Unrecognized Unicode version")

    def _key(self):
        """Return the sort key of a concrete (non-"latest") version."""
        return (self.major, self.minor, self.patch)

    def __repr__(self, with_date=True):
        """Return "x.y.z", or "latest" (suffixed with today's date if *with_date*)."""
        if self.latest:
            if with_date:
                return "latest@{}".format(datetime.datetime.now().strftime("%Y-%m-%d"))
            else:
                return "latest"
        else:
            return "{}.{}.{}".format(self.major, self.minor, self.patch)

    @property
    def tag(self):
        """Date-free form of the version, used in cache paths and download URLs."""
        return self.__repr__(with_date=False)

    def __gt__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        if self.latest:
            return True
        if other.latest:
            # A concrete version is never newer than "latest".
            return False
        return self._key() > other._key()

    def __lt__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        # Defined as the mirror image of __gt__, which is what the interpreter
        # previously fell back to when only __gt__ existed.
        return other.__gt__(self)

    def __eq__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        if self.latest or other.latest:
            return False
        return self._key() == other._key()


class UnicodeData(object):
    """In-memory view of the Unicode and UTS #46 data files for one version.

    Constructing an instance loads every file it needs (from ``args.source``,
    from the on-disk cache, or over HTTP) and populates these attributes:

      ucd_data        code point -> UnicodeData.txt fields after the code point
      ucd_props       code point -> list of property names (a defaultdict)
      ucd_block       code point -> block name
      ucd_cf          code point -> case-folded string (C and F foldings only)
      ucd_hst         code point -> Hangul syllable type
      ucd_as          code point -> joining type
      ucd_s           script name -> set of code points
      ucd_idnamt      code point -> list of UTS #46 mapping-table fields
      ucd_uts46tests  list of (line number, tuple of fields) test vectors
      max             highest code point that belongs to any block
    """

    # Line formats of the data files, compiled once instead of once per line.
    _RE_RANGE_FIRST = re.compile("^<(?P<name>.*?), First>$")
    _RE_RANGE_LAST = re.compile("^<(?P<name>.*?), Last>$")
    _RE_PROP = re.compile(
        r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$"
    )
    _RE_BLOCK = re.compile(r"^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$")
    _RE_CASEFOLD = re.compile(
        r"^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*"
    )
    # The end of the range is optional so that single-code-point entries are
    # recorded as well.
    _RE_HANGUL = re.compile(
        r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<type>\S+)\s*(|\#.*)$"
    )
    _RE_JOINING = re.compile(
        r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<jt>\S+)\s*(|\#.*)$"
    )
    _RE_SCRIPT = re.compile(
        r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<script>\S+)\s*(|\#.*)$"
    )
    _RE_IDNA_MAPPING = re.compile(
        r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<fields>[^#]+)"
    )

    def __init__(self, version, cache, args):
        """Load all data files for *version*.

        *version* is "x.y.z" or "latest"; *cache* is a cache directory or a
        false value to disable caching; *args* must provide a ``source``
        attribute (a local directory of data files, or a false value).
        """
        self.version = UnicodeVersion(version)
        self.system_version = UnicodeVersion(unicodedata.unidata_version)
        self.source = args.source
        self.cache = cache
        self.max = 0

        if self.system_version < self.version:
            print(
                "Warning: Character stability not guaranteed as Python Unicode data {} older than requested {}".format(
                    self.system_version, self.version
                )
            )

        self._load_unicodedata()
        self._load_proplist()
        self._load_derivedcoreprops()
        self._load_blocks()
        self._load_casefolding()
        self._load_hangulst()
        self._load_arabicshaping()
        self._load_scripts()
        self._load_uts46mapping()
        self._load_uts46testvectors()

    @staticmethod
    def _expand(match):
        """Return the code points covered by a matched ``start[..end]`` entry."""
        start = int(match.group("start"), 16)
        end = match.group("end")
        if end:
            return range(start, int(end, 16) + 1)
        return range(start, start + 1)

    def _add_props(self, text):
        """Append every property listed in *text* to ``self.ucd_props``."""
        for line in text.splitlines():
            result = self._RE_PROP.match(line)
            if result:
                prop = result.group("prop")
                for i in self._expand(result):
                    self.ucd_props[i].append(prop)

    def _load_unicodedata(self):
        """Parse UnicodeData.txt into ``ucd_data``, expanding First/Last ranges."""
        f_ud = self._ucdfile("UnicodeData.txt")
        self.ucd_data = {}
        range_begin = None
        for line in f_ud.splitlines():
            if not line.strip():
                continue
            fields = line.split(";")
            value = int(fields[0], 16)
            start_marker = self._RE_RANGE_FIRST.match(fields[1])
            end_marker = self._RE_RANGE_LAST.match(fields[1])
            if start_marker:
                range_begin = value
            elif end_marker:
                if range_begin is None:
                    raise ValueError("Range end U+{:04X} without a preceding range start".format(value))
                # Every code point in the range gets the generic "<Name>"
                # label; the record is built once and shared, since records
                # are only ever read.
                fields[1] = "<{}>".format(end_marker.group("name"))
                record = fields[1:]
                for i in range(range_begin, value + 1):
                    self.ucd_data[i] = record
                range_begin = None
            else:
                self.ucd_data[value] = fields[1:]

    def _load_proplist(self):
        """Start ``ucd_props`` afresh from PropList.txt."""
        f_pl = self._ucdfile("PropList.txt")
        self.ucd_props = collections.defaultdict(list)
        self._add_props(f_pl)

    def _load_derivedcoreprops(self):
        """Add DerivedCoreProperties.txt to ``ucd_props``.

        Must run after ``_load_proplist``, which creates ``ucd_props``.
        """
        self._add_props(self._ucdfile("DerivedCoreProperties.txt"))

    def _load_blocks(self):
        """Parse Blocks.txt into ``ucd_block`` and record the highest code point."""
        self.ucd_block = {}
        f_b = self._ucdfile("Blocks.txt")
        for line in f_b.splitlines():
            result = self._RE_BLOCK.match(line)
            if result:
                block = result.group("block")
                for i in self._expand(result):
                    self.ucd_block[i] = block
                    self.max = max(self.max, i)

    def _load_casefolding(self):
        """Parse the common (C) and full (F) foldings of CaseFolding.txt."""
        self.ucd_cf = {}
        f_cf = self._ucdfile("CaseFolding.txt")
        for line in f_cf.splitlines():
            result = self._RE_CASEFOLD.match(line)
            if result and result.group("type") in ("C", "F"):
                self.ucd_cf[int(result.group("cp"), 16)] = "".join(
                    chr(int(x, 16)) for x in result.group("subst").split()
                )

    def _load_hangulst(self):
        """Parse HangulSyllableType.txt into ``ucd_hst``."""
        self.ucd_hst = {}
        f_hst = self._ucdfile("HangulSyllableType.txt")
        for line in f_hst.splitlines():
            result = self._RE_HANGUL.match(line)
            if result:
                syllable_type = result.group("type")
                for i in self._expand(result):
                    self.ucd_hst[i] = syllable_type

    def _load_arabicshaping(self):
        """Parse extracted/DerivedJoiningType.txt into ``ucd_as``."""
        self.ucd_as = {}
        f_as = self._ucdfile("extracted/DerivedJoiningType.txt")
        for line in f_as.splitlines():
            result = self._RE_JOINING.match(line)
            if result:
                joining_type = result.group("jt")
                for i in self._expand(result):
                    self.ucd_as[i] = joining_type

    def _load_scripts(self):
        """Parse Scripts.txt into ``ucd_s`` (script name -> set of code points)."""
        self.ucd_s = {}
        f_s = self._ucdfile("Scripts.txt")
        for line in f_s.splitlines():
            result = self._RE_SCRIPT.match(line)
            if result:
                self.ucd_s.setdefault(result.group("script"), set()).update(self._expand(result))

    def _load_uts46mapping(self):
        """Parse IdnaMappingTable.txt into ``ucd_idnamt``."""
        self.ucd_idnamt = {}
        f_idnamt = self._ucdfile("IdnaMappingTable.txt", urlbase=UTS46_URL)
        for line in f_idnamt.splitlines():
            result = self._RE_IDNA_MAPPING.match(line)
            if result:
                fields = [x.strip() for x in result.group("fields").split(";")]
                for i in self._expand(result):
                    self.ucd_idnamt[i] = fields

    def _load_uts46testvectors(self):
        """Parse IdnaTestV2.txt into ``ucd_uts46tests``.

        Each entry is ``(1-based line number, tuple of stripped fields)`` with
        backslash-u escapes already expanded.
        """
        self.ucd_uts46tests = []
        f_uts46tests = self._ucdfile("IdnaTestV2.txt", urlbase=UTS46_URL)
        for lineno, line in enumerate(f_uts46tests.splitlines(), start=1):
            line = line.split("#", 1)[0]
            # Skip lines that hold nothing but a comment and/or whitespace;
            # keeping them would yield a one-field record.
            if not line.strip():
                continue
            fields = tuple(field.strip() for field in unicode_fixup(line).split(";"))
            self.ucd_uts46tests.append((lineno, fields))

    def _ucdfile(self, filename, urlbase=UCD_URL):
        """Return the text of data file *filename*.

        Lookup order: the local ``source`` directory if one was given, then
        the cache (if enabled), then a download from *urlbase*. A downloaded
        file is written to the cache when caching is enabled. All files are
        read and written as UTF-8.
        """
        if self.source:
            with open("{}/{}".format(self.source, filename), encoding="utf-8") as f:
                return f.read()

        cache_file = None
        if self.cache:
            cache_file = os.path.expanduser("{}/{}/{}".format(self.cache, self.version.tag, filename))
            if os.path.isfile(cache_file):
                with open(cache_file, encoding="utf-8") as f:
                    return f.read()

        version_path = self.version.tag
        # NOTE(review): only the UCD tree nests "latest" under "UCD/"; the
        # IDNA files are assumed to live at /Public/idna/latest/ -- confirm
        # against the unicode.org directory layout.
        if version_path == "latest" and urlbase == UCD_URL:
            version_path = "UCD/latest"
        url = urlbase.format(
            version=version_path,
            filename=filename,
        )
        with urlopen(url) as response:
            content = response.read().decode("utf-8")

        if cache_file:
            os.makedirs(os.path.dirname(cache_file), exist_ok=True)
            with open(cache_file, "wb") as f:
                f.write(content.encode("utf-8"))

        return content

    def codepoints(self):
        """Yield a CodePoint for every value from 0 up to and including ``max``."""
        for i in range(0, self.max + 1):
            yield CodePoint(i, ucdata=self)


class CodePoint:
    """A single code point together with the Unicode data used to classify it.

    *value* is the integer code point and *ucdata* an object exposing the
    ``ucd_*`` lookup tables (normally a UnicodeData instance). Every property
    is computed on access and only reads from those tables.
    """

    def __init__(self, value=None, ucdata=None):
        self.value = value
        self.ucdata = ucdata

    def _props(self):
        """Return the property names recorded for this code point.

        Uses ``.get`` rather than indexing: ``ucd_props`` is a defaultdict, so
        indexing would insert an empty list for every code point queried.
        """
        return self.ucdata.ucd_props.get(self.value, ())

    def _casefold(self, s):
        """Case-fold *s* character by character using the loaded folding table."""
        folding = self.ucdata.ucd_cf
        return "".join(folding.get(ord(c), c) for c in s)

    @property
    def exception_value(self):
        """Status forced by the ``exceptions`` table, or False if none."""
        return exceptions.get(self.value, False)

    @property
    def compat_value(self):
        """Status forced by the ``backwardscompatible`` table, or False if none."""
        return backwardscompatible.get(self.value, False)

    @property
    def name(self):
        """Character name, or "<noncharacter>" / "<reserved>" when it has none."""
        record = self.ucdata.ucd_data.get(self.value)
        if record is not None:
            return record[0]
        if "Noncharacter_Code_Point" in self._props():
            return "<noncharacter>"
        return "<reserved>"

    @property
    def general_category(self):
        """General category from UnicodeData.txt, or None if not listed there."""
        return self.ucdata.ucd_data.get(self.value, [None, None])[1]

    @property
    def unassigned(self):
        """True when the code point is neither assigned nor a noncharacter."""
        return not ("Noncharacter_Code_Point" in self._props() or self.value in self.ucdata.ucd_data)

    @property
    def ldh(self):
        """True for the hyphen, the ASCII digits and the lowercase ASCII letters."""
        return (
            self.value == 0x002D
            or self.value in range(0x0030, 0x0039 + 1)
            or self.value in range(0x0061, 0x007A + 1)
        )

    @property
    def join_control(self):
        """True when the code point has the Join_Control property."""
        return "Join_Control" in self._props()

    @property
    def joining_type(self):
        """Joining type, or None if none is recorded."""
        return self.ucdata.ucd_as.get(self.value, None)

    @property
    def char(self):
        """The code point as a one-character string."""
        return chr(self.value)

    @property
    def nfkc_cf(self):
        """NFKC(casefold(NFKC(char))), using the interpreter's normalization data."""
        return unicodedata.normalize("NFKC", self._casefold(unicodedata.normalize("NFKC", self.char)))

    @property
    def unstable(self):
        """True when the character changes under the ``nfkc_cf`` transformation."""
        return self.char != self.nfkc_cf

    @property
    def in_ignorableproperties(self):
        """True when any of the three ignorable properties applies."""
        props = self._props()
        return any(
            prop in props for prop in ("Default_Ignorable_Code_Point", "White_Space", "Noncharacter_Code_Point")
        )

    @property
    def in_ignorableblocks(self):
        """True when the code point lies in one of the ignorable blocks."""
        return self.ucdata.ucd_block.get(self.value) in (
            "Combining Diacritical Marks for Symbols",
            "Musical Symbols",
            "Ancient Greek Musical Notation",
        )

    @property
    def oldhanguljamo(self):
        """True when the Hangul syllable type is L, V or T."""
        return self.ucdata.ucd_hst.get(self.value) in ("L", "V", "T")

    @property
    def in_lettersdigits(self):
        """True when the general category is Ll, Lu, Lo, Nd, Lm, Mn or Mc."""
        return self.general_category in ("Ll", "Lu", "Lo", "Nd", "Lm", "Mn", "Mc")

    @property
    def idna2008_status(self):
        """Derived IDNA 2008 status; the first matching rule below wins."""
        if self.exception_value:
            return self.exception_value
        if self.compat_value:
            return self.compat_value
        if self.unassigned:
            return "UNASSIGNED"
        if self.ldh:
            return "PVALID"
        if self.join_control:
            return "CONTEXTJ"
        if self.unstable or self.in_ignorableproperties or self.in_ignorableblocks or self.oldhanguljamo:
            return "DISALLOWED"
        if self.in_lettersdigits:
            return "PVALID"
        return "DISALLOWED"

    @property
    def uts46_data(self):
        """List of UTS #46 mapping-table fields, or None if there is no entry."""
        return self.ucdata.ucd_idnamt.get(self.value, None)

    @property
    def uts46_status(self):
        """The UTS #46 fields joined by spaces; empty when there is no entry."""
        data = self.uts46_data
        if not data:
            return ""
        return " ".join(data)


def diagnose_codepoint(codepoint, args, ucdata):
    """Print a step-by-step report of how *codepoint* is classified.

    *codepoint* is an integer code point and *ucdata* the loaded data tables.
    *args* is accepted but not used by this function. The numbered lines of
    the report follow the order in which CodePoint.idna2008_status tests its
    rules; the unnumbered lines show the raw data those rules are based on.
    The report ends with the resulting IDNA 2008 and UTS 46 statuses and the
    requested and interpreter Unicode versions.
    """
    cp = CodePoint(codepoint, ucdata=ucdata)

    print("U+{:04X}:".format(codepoint))
    print("   Name:             {}".format(cp.name))
    print("1  Exceptions:       {}".format(exceptions.get(codepoint, False)))
    print("2  Backwards Compat: {}".format(backwardscompatible.get(codepoint, False)))
    print("3  Unassigned:       {}".format(cp.unassigned))
    print("4  LDH:              {}".format(cp.ldh))
    print("   Properties:       {}".format(" ".join(sorted(ucdata.ucd_props.get(codepoint, ["None"])))))
    print("5  .Join Control:    {}".format(cp.join_control))
    print("   NFKC CF:          {}".format(" ".join(["U+{:04X}".format(ord(x)) for x in cp.nfkc_cf])))
    print("6  .Unstable:        {}".format(cp.unstable))
    print("7  .Ignorable Prop:  {}".format(cp.in_ignorableproperties))
    print("   Block:            {}".format(ucdata.ucd_block.get(codepoint, None)))
    print("8  .Ignorable Block: {}".format(cp.in_ignorableblocks))
    print("   Hangul Syll Type: {}".format(ucdata.ucd_hst.get(codepoint, None)))
    print("9  .Old Hangul Jamo: {}".format(cp.oldhanguljamo))
    print("   General Category: {}".format(cp.general_category))
    print("10 .Letters Digits:  {}".format(cp.in_lettersdigits))
    print("== IDNA 2008:        {}".format(cp.idna2008_status))
    print("== UTS 46:           {}".format(cp.uts46_status))
    print("(Unicode {} [sys:{}])".format(ucdata.version, ucdata.system_version))


def ucdrange(start, end):
    """Describe a run of code points as a ``(values, names)`` pair of strings.

    *start* and *end* are objects with ``value`` and ``name`` attributes. When
    they compare equal the pair describes that single code point; otherwise
    both strings take the form ``first..last``.
    """
    is_single = start == end
    if not is_single:
        values = "{:04X}..{:04X}".format(start.value, end.value)
        names = "{}..{}".format(start.name, end.name)
        return (values, names)
    return ("{:04X}".format(start.value), start.name)


def upper_hex(value):
    """Format *value* like ``hex()`` but with the digits after the prefix upper-cased."""
    text = hex(value)
    prefix, digits = text[:2], text[2:]
    return prefix + digits.upper()


def optimised_list(d):
    """Yield source lines of a tuple literal holding *d* in compressed form.

    *d* is passed through ``intranges_from_list``. A single resulting value is
    emitted as a one-line ``(X,),``; several values are emitted one per line
    between ``(`` and ``),``.
    """
    values = intranges_from_list(d)
    if len(values) != 1:
        yield "("
        for value in values:
            yield "        {},".format(upper_hex(value))
        yield "    ),"
        return
    # Respect ruff format style
    (only,) = values
    yield "({},),".format(upper_hex(only))


def make_table(args, ucdata):
    """Write or print an IANA-style table of IDNA 2008 statuses.

    Consecutive code points that share a status are collapsed into one row of
    ``[code point or range, status, name or name range]``. When ``args.dir``
    is set the rows are written, ASCII-encoded, to
    ``idna-table-<version>.txt`` in that directory; otherwise they are printed.
    """
    table_data = []
    # Only the first and last code point of the current run are needed.
    first = last = None
    run_status = None

    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if first is not None and status != run_status:
            (values, description) = ucdrange(first, last)
            table_data.append([values, run_status, description])
            first = None
        if first is None:
            first = cp
        last = cp
        run_status = status
    # Flush the final run; there is none when no code points were produced.
    if first is not None:
        (values, description) = ucdrange(first, last)
        table_data.append([values, run_status, description])

    if args.dir:
        # The context manager closes the file even if a row fails to encode.
        with open("{}/idna-table-{}.txt".format(args.dir, ucdata.version), "wb") as f:
            for row in table_data:
                f.write("{:12}; {:12}# {:.44}\n".format(*row).encode("ascii"))
    else:
        for row in table_data:
            print("{:12}; {:12}# {:.44}".format(*row))


def idna_libdata(ucdata):
    """Yield the source lines of the generated ``idnadata`` module.

    The output consists of a version line followed by three dict literals:
    ``scripts`` (for every script in SCRIPT_WHITELIST), ``joining_types`` and
    ``codepoint_classes`` (PVALID, CONTEXTJ and CONTEXTO only).
    """
    yield "# This file is automatically generated by tools/idna-data\n"
    yield '__version__ = "{}"\n'.format(ucdata.version)

    # Script membership, needed by some of the contextual (CONTEXTO) rules.
    yield "scripts = {"
    for script in SCRIPT_WHITELIST:
        lead = '    "{}": '.format(script)
        for index, line in enumerate(optimised_list(ucdata.ucd_s[script])):
            # Only the first line of each entry carries the dict key.
            yield (lead if index == 0 else "") + line
    yield "}"

    # Joining types, needed by CONTEXTJ rule A.1; stored as the ordinal of
    # the joining-type letter.
    yield "joining_types = {"
    for cp in ucdata.codepoints():
        joining_type = cp.joining_type
        if joining_type:
            yield "    0x{:X}: {},".format(cp.value, ord(joining_type))
    yield "}"

    # Code points grouped by derived status; UNASSIGNED and DISALLOWED are
    # left out of the generated data.
    yield "codepoint_classes = {"
    classes = {}
    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if status not in ("UNASSIGNED", "DISALLOWED"):
            classes.setdefault(status, set()).add(cp.value)
    for status in ["PVALID", "CONTEXTJ", "CONTEXTO"]:
        lead = '    "{}": '.format(status)
        for index, line in enumerate(optimised_list(classes[status])):
            yield (lead if index == 0 else "") + line
    yield "}"


def uts46_ranges(ucdata):
    """Yield tuple literals (as source text) describing the UTS #46 mapping.

    Each literal is ``(code point, status)`` or ``(code point, status,
    mapping)``. Above U+00FF a code point is emitted only when its status or
    mapping differs from the previously emitted entry, so each literal marks
    the start of a run; code points up to U+00FF are always emitted.
    """
    previous = (None, None)
    for cp in ucdata.codepoints():
        fields = cp.uts46_data
        if not fields:
            continue

        status, has_mapping = UTS46_STATUSES[fields[0]]
        mapping = None
        if has_mapping:
            decoded = "".join(chr(int(hexdigits, 16)) for hexdigits in fields[1].split())
            # Backslashes are doubled so they survive inside the string literal.
            mapping = decoded.replace("\\", "\\\\")

        current = (status, mapping)
        if cp.value > 255 and current == previous:
            continue
        previous = current

        if mapping is None:
            yield '(0x{:X}, "{}")'.format(cp.value, status)
        elif '"' in mapping:
            # Quote with apostrophes when the mapping contains a double quote.
            yield "(0x{:X}, \"{}\", '{}')".format(cp.value, status, mapping)
        else:
            yield '(0x{:X}, "{}", "{}")'.format(cp.value, status, mapping)


def uts46_libdata(ucdata):
    """Yield the source lines of the generated ``uts46data`` module.

    The rows from ``uts46_ranges`` are split into ``_seg_N()`` functions of
    UTS46_SEGMENT_SIZE rows each, and ``uts46data`` is defined as the
    concatenation of all segments.
    """
    yield "# This file is automatically generated by tools/idna-data"
    yield "# vim: set fileencoding=utf-8 :\n"
    yield "from typing import List, Tuple, Union\n"
    yield '"""IDNA Mapping Table from UTS46."""\n\n'

    yield '__version__ = "{}"\n'.format(ucdata.version)

    # Stays at -1 when there are no rows at all.
    idx = -1
    for idx, row in enumerate(uts46_ranges(ucdata)):
        segment, offset = divmod(idx, UTS46_SEGMENT_SIZE)
        if offset == 0:
            # Close the previous segment (if any) before opening a new one.
            if idx != 0:
                yield "    ]\n"
            yield "\ndef _seg_{}() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]:\n    return [".format(
                segment
            )
        yield "        {},".format(row)
    yield "    ]\n"

    last_segment = idx // UTS46_SEGMENT_SIZE
    yield "\nuts46data = tuple("
    yield "    _seg_0()"
    for number in range(1, last_segment + 1):
        yield "    + _seg_{}()".format(number)
    yield ")  # type: Tuple[Union[Tuple[int, str], Tuple[int, str, str]], ...]"


def uts46_tests(ucdata):
    """Yield the source lines of a generated unittest module for UTS #46.

    One ``test_uts46_<line number>`` method is produced for every test vector
    in ``ucdata.ucd_uts46tests`` whose source string is not listed in
    ``skip_tests``. Each method checks ``idna.decode`` and ``idna.encode``
    against the vector: an exact result is asserted when the vector reports no
    errors and the expected Unicode output contains no NV8/XV8 code point;
    otherwise ``idna.IDNAError`` is asserted.
    """
    skip_tests = [
        # These appear to be errors in the test vectors so we skip building tests for
        # them from UTS46 data. All relate to incorrectly applying bidi rules across
        # label boundaries. Appears independently confirmed at
        # http://www.alvestrand.no/pipermail/idna-update/2017-January/007946.html
        "0\u00e0.\u05d0",
        "0a\u0300.\u05d0",
        "0A\u0300.\u05d0",
        "0\u00c0.\u05d0",
        "xn--0-sfa.xn--4db",
        "\u00e0\u02c7.\u05d0",
        "a\u0300\u02c7.\u05d0",
        "A\u0300\u02c7.\u05d0",
        "\u00c0\u02c7.\u05d0",
        "xn--0ca88g.xn--4db",
        "0A.\u05d0",
        "0a.\u05d0",
        "0a.xn--4db",
        "c.xn--0-eha.xn--4db",
        "c.0\u00fc.\u05d0",
        "c.0u\u0308.\u05d0",
        "C.0U\u0308.\u05d0",
        "C.0\u00dc.\u05d0",
        "C.0\u00fc.\u05d0",
        "C.0\u0075\u0308.\u05d0",
        "\u06b6\u06df\u3002\u2087\ua806",
        "\u06b6\u06df\u30027\ua806",
        "xn--pkb6f.xn--7-x93e",
        "\u06b6\u06df.7\ua806",
        "1.\uac7e6.\U00010c41\u06d0",
        "1.\u1100\u1165\u11b56.\U00010c41\u06d0",
        "1.xn--6-945e.xn--glb1794k",
        # Test vectors that expect an error when there is a trailing
        # dot.
        "a.b.c.d.",
        "xn--gl0as212a.i.",
        "繱𑖿.i.",
        "繱𑖿.I.",
        "xn--1ug6928ac48e.i.",
        "繱𑖿\u200d.i.",
        "繱𑖿\u200d.I.",
        "xn--9hb7344k.",
        "𐫇١.",
        "123456789012345678901234567890123456789012345678901234567890123.123456789012345678901234567890123456789012345678901234567890123.123456789012345678901234567890123456789012345678901234567890123.123456789012345678901234567890123456789012345678901234567890b.",
        "xn--rt6a.",
        "鱊.",
        "xn--4-0bd15808a.",
        "𞤺ߌ4.",
        "xn--hva754s.",
        "ⴖͦ.",
        "xn--ss-f4j.b.",
        "ss်.b.",
        "xn--gky8837e.",
        "璼𝨭.",
        "xn--7zv.",
        "梉.",
        "xn--ss-59d.",
        "ss۫.",
        "123456789012345678901234567890123456789012345678901234567890123.1234567890ä123456789012345678901234567890123456789012345.123456789012345678901234567890123456789012345678901234567890123.123456789012345678901234567890123456789012345678901234567890b.",
        "123456789012345678901234567890123456789012345678901234567890123.xn--1234567890123456789012345678901234567890123456789012345-kue.123456789012345678901234567890123456789012345678901234567890123.123456789012345678901234567890123456789012345678901234567890b.",
    ]

    yield "# This file is automatically generated by tools/idna-data\n"
    yield "import unittest\n"
    yield "import idna\n\n"
    yield "class UTS46Tests(unittest.TestCase):\n"

    for lineno, fields in ucdata.ucd_uts46tests:
        # Every test vector must have exactly seven fields; the unpacking
        # below raises ValueError otherwise.
        (
            source,
            to_unicode,
            to_unicode_status,
            to_ascii,
            to_ascii_status,
            to_ascii_t,
            to_ascii_t_status,
        ) = fields

        if source in skip_tests:
            continue

        # Per UTS46 test vectors, if the result is "", expect a blank string, but
        # if the result is blank, expect the input string.
        # Each blank field falls back to the field before it: to_unicode to the
        # source, to_ascii to to_unicode, to_ascii_t to to_ascii, and likewise
        # for the status fields (a blank to_unicode_status means no errors).
        if to_unicode == '""':
            to_unicode = ""
        elif not to_unicode:
            to_unicode = source
        if not to_unicode_status:
            to_unicode_status = "[]"
        if to_ascii == '""':
            to_ascii = ""
        elif not to_ascii:
            to_ascii = to_unicode
        if not to_ascii_status:
            to_ascii_status = to_unicode_status
        # The transitional values are resolved here but are not referenced by
        # any of the lines generated below.
        if to_ascii_t == '""':
            to_ascii_t = ""
        elif not to_ascii_t:
            to_ascii_t = to_ascii
        if not to_ascii_t_status:
            to_ascii_t_status = to_ascii_status

        # Is this label IDNA 2008 legal according to UTS46 mapping table?
        # nv8 ends up holding the last code point of the expected output whose
        # third mapping-table field is NV8 or XV8, or False if there is none.
        nv8 = False
        for codepoint in to_unicode:
            try:
                field = ucdata.ucd_idnamt[ord(codepoint)][2]
            except IndexError:
                # The mapping-table entry has fewer than three fields.
                field = ""
            if field == "NV8" or field == "XV8":
                nv8 = ord(codepoint)

        yield "    def test_uts46_{}(self):".format(lineno)

        if to_unicode_status == "[]" and not nv8:
            yield (
                "        self.assertEqual(idna.decode({}, uts46=True, strict=True), {})".format(repr(source), repr(to_unicode))
            )
        else:
            yield ("        self.assertRaises(idna.IDNAError, idna.decode, {}, strict=True)".format(repr(source)))
        if to_ascii_status == "[]" and not nv8:
            yield (
                "        self.assertEqual(idna.encode({}, uts46=True, strict=True), b{})".format(repr(source), repr(to_ascii))
            )
        else:
            yield ("        self.assertRaises(idna.IDNAError, idna.encode, {}, strict=True)".format(repr(source)))
        yield ""


#        self.assertEqual(idna.uts46_remap("A_", std3_rules=False), "a_")
#        self.assertRaises(idna.InvalidCodepoint, idna.uts46_remap, "A_", std3_rules=True)


"""
    try:
            output = idna.decode(source, uts46=True, strict=True)
            if to_unicode_status != "[]":
                self.fail("decode() did not emit required error {} for {}".format(to_unicode, repr(source)))
            self.assertEqual(output, to_unicode, "unexpected decode() output")
        except (idna.IDNAError, UnicodeError, ValueError) as exc:
            if str(exc).startswith("Unknown"):
                raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
            if to_unicode_status == "[]":
                raise

        try:
            output = idna.encode(source, uts46=True, strict=True).decode("ascii")
            if to_ascii_status != "[]":
                self.fail("encode() did not emit required error {} for {}".format(to_ascii_status, repr(source)))
            self.assertEqual(output, to_ascii, "unexpected encode() output")
        except (idna.IDNAError, UnicodeError, ValueError) as exc:
            if str(exc).startswith("Unknown"):
                raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
            if to_ascii_status == "[]":
                raise

        try:
            output = idna.encode(source, uts46=True, strict=True, transitional=True).decode("ascii")
            if to_ascii_t_status != "[]":
                self.fail(
                    "encode(transitional=True) did not emit required error {} for {}".format(to_ascii_t_status, repr(source))
                )
            self.assertEqual(output, to_ascii_t, "unexpected encode() output")
        except (idna.IDNAError, UnicodeError, ValueError) as exc:
            if str(exc).startswith("Unknown"):
                raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
            if to_ascii_t_status == "[]":
                raise
"""


def make_libdata(args, ucdata):
    """Write the three generated Python modules into the output directory.

    The directory is ``args.dir``, or the current directory when that is
    unset. Every generated line is written UTF-8 encoded and newline
    terminated.
    """
    dest_dir = args.dir or "."

    # (file name, generator producing its lines), written in this order.
    outputs = (
        ("idnadata.py", idna_libdata),
        ("uts46data.py", uts46_libdata),
        ("test_idna_uts46.py", uts46_tests),
    )
    for filename, generate in outputs:
        target_filename = os.path.join(dest_dir, filename)
        with open(target_filename, "wb") as target:
            for line in generate(ucdata):
                target.write((line + "\n").encode("utf-8"))


def arg_error(message, parser):
    """Print the parser's usage line and *message*, then exit with status 2."""
    parser.print_usage()
    program = sys.argv[0]
    print("{}: error: {}".format(program, message))
    raise SystemExit(2)


def main():
    """Command-line entry point.

    Parses the arguments, loads the Unicode data for the requested version and
    runs the requested action: ``make-table``, ``make-libdata`` or a code
    point such as ``U+0061`` to diagnose. Exits with status 2 on invalid
    arguments.
    """
    parser = argparse.ArgumentParser(description="Determine IDNA code-point validity data")
    # The help text names the action exactly as it is matched below.
    parser.add_argument("action", type=str, help="Task to perform (make-libdata, make-table, <codepoint>)")

    parser.add_argument("--version", type=str, default="preferred", help="Unicode version to use (preferred, latest, <x.y.z>)")
    parser.add_argument("--source", type=str, default=None, help="Where to fetch Unicode data (file path)")
    parser.add_argument("--dir", type=str, default=None, help="Where to export the output")
    parser.add_argument("--cache", type=str, default=None, help="Where to cache Unicode data")
    parser.add_argument("--no-cache", action="store_true", help="Don't cache Unicode data")

    # These groups hold no arguments of their own; they are registered for
    # their title and description only.
    parser.add_argument_group("make-libdata", "Make module data for Python IDNA library")
    parser.add_argument_group("make-table", "Make IANA-style reference table")
    parser.add_argument_group("codepoint", "Display related data for given codepoint (e.g. U+0061)")

    args = parser.parse_args()

    if args.version == "preferred":
        target_version = PREFERRED_VERSION
    else:
        target_version = args.version

    if args.cache and args.no_cache:
        arg_error("I can't both --cache and --no-cache", parser)
    cache = args.cache or DEFAULT_CACHE_DIR
    if args.no_cache:
        cache = None

    ucdata = UnicodeData(target_version, cache, args)

    if args.action == "make-table":
        make_table(args, ucdata)
    elif args.action == "make-libdata":
        make_libdata(args, ucdata)
    else:
        # Anything else must be a code point: 4-6 hex digits, optional "U+".
        result = re.match(r"(?i)^(U\+|)(?P<cp>[0-9A-F]{4,6})$", args.action)
        if not result:
            arg_error("Don't recognize action or codepoint value", parser)
        codepoint = int(result.group("cp"), 16)
        diagnose_codepoint(codepoint, args, ucdata)
        sys.exit(0)


if __name__ == "__main__":
    main()
