package unicodeclass

import (
	"bufio"
	"strings"
	"unicode"
	"unicode/utf8"
)

// Class identifies the character class that Is assigns to a rune. Adjacent
// runes with equal Class values are grouped into one token by SplitClass.
type Class int

// Named Class values. The small values (0-3) match the literals returned by
// Is; the larger values equal the value column of the corresponding ranges in
// the classes table. The spellings "Punctation" and "HungulSyllables" are
// exported identifiers and are kept as they are.
const (
	Invalid         Class = -1     // no class yet; SplitClass starts from this value
	Blank           Class = 0      // space, tab, NUL, U+00A0 and table ranges with value 0
	Punctation      Class = 1      // punctuation: unicode.IsPunct below 0x100, table ranges with value 1
	Word            Class = 2      // word characters; also what Is returns when nothing else applies
	Emoji           Class = 3      // ranges listed in the emoji table
	SuperScript     Class = 0x2070 // value of the "superscript" range in classes
	SubScript       Class = 0x2080 // value of the "subscript" range in classes
	Braille         Class = 0x2800 // value of the "braille" range in classes
	Hiragana        Class = 0x3040 // value of the "Hiragana" range in classes
	Katakana        Class = 0x30a0 // value of the "Katakana" range in classes
	CJKIdeographs   Class = 0x4e00 // value shared by all "CJK Ideographs" ranges in classes
	HungulSyllables Class = 0xac00 // value of the "Hangul Syllables" range in classes
)

// classes maps code point ranges to class values for runes at or above 0x100.
// Each entry is {first, last, value}: first and last delimit the range and
// value is converted to a Class by Is. Entries such as {0x037e, 0x037e, 1}
// describe a single code point, so the bounds are meant to be inclusive.
//
// Is walks this table from the top and returns on the first entry that
// matches, so the order of entries matters where ranges overlap.
//
// NOTE(review): the entry 0x2060-0x27ff overlaps the superscript, subscript
// and "all kinds of symbols" entries that follow it. With a front-to-back scan
// the earlier entry is hit first; confirm whether the later entries are
// expected to be reachable.
var classes = []struct {
	first rune
	last  rune
	value int
}{
	{0x037e, 0x037e, 1}, /* Greek question mark */
	{0x0387, 0x0387, 1}, /* Greek ano teleia */
	{0x055a, 0x055f, 1}, /* Armenian punctuation */
	{0x0589, 0x0589, 1}, /* Armenian full stop */
	{0x05be, 0x05be, 1},
	{0x05c0, 0x05c0, 1},
	{0x05c3, 0x05c3, 1},
	{0x05f3, 0x05f4, 1},
	{0x060c, 0x060c, 1},
	{0x061b, 0x061b, 1},
	{0x061f, 0x061f, 1},
	{0x066a, 0x066d, 1},
	{0x06d4, 0x06d4, 1},
	{0x0700, 0x070d, 1}, /* Syriac punctuation */
	{0x0964, 0x0965, 1},
	{0x0970, 0x0970, 1},
	{0x0df4, 0x0df4, 1},
	{0x0e4f, 0x0e4f, 1},
	{0x0e5a, 0x0e5b, 1},
	{0x0f04, 0x0f12, 1},
	{0x0f3a, 0x0f3d, 1},
	{0x0f85, 0x0f85, 1},
	{0x104a, 0x104f, 1}, /* Myanmar punctuation */
	{0x10fb, 0x10fb, 1}, /* Georgian punctuation */
	{0x1361, 0x1368, 1}, /* Ethiopic punctuation */
	{0x166d, 0x166e, 1}, /* Canadian Syl. punctuation */
	{0x1680, 0x1680, 0},
	{0x169b, 0x169c, 1},
	{0x16eb, 0x16ed, 1},
	{0x1735, 0x1736, 1},
	{0x17d4, 0x17dc, 1}, /* Khmer punctuation */
	{0x1800, 0x180a, 1}, /* Mongolian punctuation */
	{0x2000, 0x200b, 0}, /* spaces */
	{0x200c, 0x2027, 1}, /* punctuation and symbols */
	{0x2028, 0x2029, 0},
	{0x202a, 0x202e, 1}, /* punctuation and symbols */
	{0x202f, 0x202f, 0},
	{0x2030, 0x205e, 1}, /* punctuation and symbols */
	{0x205f, 0x205f, 0},
	{0x2060, 0x27ff, 1},      /* punctuation and symbols */
	{0x2070, 0x207f, 0x2070}, /* superscript */
	{0x2080, 0x2094, 0x2080}, /* subscript */
	{0x20a0, 0x27ff, 1},      /* all kinds of symbols */
	{0x2800, 0x28ff, 0x2800}, /* braille */
	{0x2900, 0x2998, 1},      /* arrows, brackets, etc. */
	{0x29d8, 0x29db, 1},
	{0x29fc, 0x29fd, 1},
	{0x2e00, 0x2e7f, 1}, /* supplemental punctuation */
	{0x3000, 0x3000, 0}, /* ideographic space */
	{0x3001, 0x3020, 1}, /* ideographic punctuation */
	{0x3030, 0x3030, 1},
	{0x303d, 0x303d, 1},
	{0x3040, 0x309f, 0x3040}, /* Hiragana */
	{0x30a0, 0x30ff, 0x30a0}, /* Katakana */
	{0x3300, 0x9fff, 0x4e00}, /* CJK Ideographs */
	{0xac00, 0xd7a3, 0xac00}, /* Hangul Syllables */
	{0xf900, 0xfaff, 0x4e00}, /* CJK Ideographs */
	{0xfd3e, 0xfd3f, 1},
	{0xfe30, 0xfe6b, 1},        /* punctuation forms */
	{0xff00, 0xff0f, 1},        /* half/fullwidth ASCII */
	{0xff1a, 0xff20, 1},        /* half/fullwidth ASCII */
	{0xff3b, 0xff40, 1},        /* half/fullwidth ASCII */
	{0xff5b, 0xff65, 1},        /* half/fullwidth ASCII */
	{0x20000, 0x2a6df, 0x4e00}, /* CJK Ideographs */
	{0x2a700, 0x2b73f, 0x4e00}, /* CJK Ideographs */
	{0x2b740, 0x2b81f, 0x4e00}, /* CJK Ideographs */
	{0x2f800, 0x2fa1f, 0x4e00}, /* CJK Ideographs */
}

// emoji lists code point ranges, as {first, last} pairs, that Is reports as
// class 3 (emoji). Most entries have first == last and describe a single code
// point, so the bounds are meant to be inclusive.
//
// Is only consults this table after no entry in classes matched.
//
// NOTE(review): some entries here (for example 0x203c, 0x2122, 0x3030 and
// 0x303d) also lie inside ranges listed in classes, which is checked first;
// confirm which table is meant to take precedence for those code points.
var emoji = []struct {
	first rune
	last  rune
}{
	{0x203c, 0x203c},
	{0x2049, 0x2049},
	{0x2122, 0x2122},
	{0x2139, 0x2139},
	{0x2194, 0x2199},
	{0x21a9, 0x21aa},
	{0x231a, 0x231b},
	{0x2328, 0x2328},
	{0x23cf, 0x23cf},
	{0x23e9, 0x23f3},
	{0x24c2, 0x24c2},
	{0x25aa, 0x25ab},
	{0x25b6, 0x25b6},
	{0x25c0, 0x25c0},
	{0x25fb, 0x25fe},
	{0x2600, 0x2604},
	{0x260e, 0x260e},
	{0x2611, 0x2611},
	{0x2614, 0x2615},
	{0x2618, 0x2618},
	{0x261d, 0x261d},
	{0x2620, 0x2620},
	{0x2622, 0x2623},
	{0x2626, 0x2626},
	{0x262a, 0x262a},
	{0x262e, 0x262f},
	{0x2638, 0x263a},
	{0x2648, 0x2653},
	{0x2660, 0x2660},
	{0x2663, 0x2663},
	{0x2665, 0x2666},
	{0x2668, 0x2668},
	{0x267b, 0x267b},
	{0x267f, 0x267f},
	{0x2692, 0x2694},
	{0x2696, 0x2697},
	{0x2699, 0x2699},
	{0x269b, 0x269c},
	{0x26a0, 0x26a1},
	{0x26aa, 0x26ab},
	{0x26b0, 0x26b1},
	{0x26bd, 0x26be},
	{0x26c4, 0x26c5},
	{0x26c8, 0x26c8},
	{0x26ce, 0x26cf},
	{0x26d1, 0x26d1},
	{0x26d3, 0x26d4},
	{0x26e9, 0x26ea},
	{0x26f0, 0x26f5},
	{0x26f7, 0x26fa},
	{0x26fd, 0x26fd},
	{0x2702, 0x2702},
	{0x2705, 0x2705},
	{0x2708, 0x270d},
	{0x270f, 0x270f},
	{0x2712, 0x2712},
	{0x2714, 0x2714},
	{0x2716, 0x2716},
	{0x271d, 0x271d},
	{0x2721, 0x2721},
	{0x2728, 0x2728},
	{0x2733, 0x2734},
	{0x2744, 0x2744},
	{0x2747, 0x2747},
	{0x274c, 0x274c},
	{0x274e, 0x274e},
	{0x2753, 0x2755},
	{0x2757, 0x2757},
	{0x2763, 0x2764},
	{0x2795, 0x2797},
	{0x27a1, 0x27a1},
	{0x27b0, 0x27b0},
	{0x27bf, 0x27bf},
	{0x2934, 0x2935},
	{0x2b05, 0x2b07},
	{0x2b1b, 0x2b1c},
	{0x2b50, 0x2b50},
	{0x2b55, 0x2b55},
	{0x3030, 0x3030},
	{0x303d, 0x303d},
	{0x3297, 0x3297},
	{0x3299, 0x3299},
	{0x1f004, 0x1f004},
	{0x1f0cf, 0x1f0cf},
	{0x1f170, 0x1f171},
	{0x1f17e, 0x1f17f},
	{0x1f18e, 0x1f18e},
	{0x1f191, 0x1f19a},
	{0x1f1e6, 0x1f1ff},
	{0x1f201, 0x1f202},
	{0x1f21a, 0x1f21a},
	{0x1f22f, 0x1f22f},
	{0x1f232, 0x1f23a},
	{0x1f250, 0x1f251},
	{0x1f300, 0x1f320},
	{0x1f330, 0x1f335},
	{0x1f337, 0x1f37c},
	{0x1f380, 0x1f393},
	{0x1f3a0, 0x1f3c4},
	{0x1f3c6, 0x1f3ca},
	{0x1f3e0, 0x1f3f0},
	{0x1f400, 0x1f43e},
	{0x1f440, 0x1f440},
	{0x1f442, 0x1f4f7},
	{0x1f4f9, 0x1f4fc},
	{0x1f500, 0x1f53d},
	{0x1f550, 0x1f567},
	{0x1f5fb, 0x1f640},
	{0x1f645, 0x1f64f},
	{0x1f680, 0x1f6c5},
}

// Is reports the character class of r.
//
// Runes below 0x100 are classified directly: space, tab, NUL and the
// no-break space (0xa0) are blank (0), runes for which unicode.IsPunct holds
// are punctuation (1), and everything else is a word character (2).
//
// All other runes are looked up in the classes table and then in the emoji
// table (class 3). Both tables hold inclusive [first, last] ranges. classes is
// consulted first and the first matching entry wins, so a rune covered by
// both tables gets its classes value. A rune found in neither table is a word
// character (2).
func Is(r rune) Class {
	if r < 0x100 {
		switch {
		case r == ' ' || r == '\t' || r == 0 || r == 0xa0:
			return 0 // blank
		case unicode.IsPunct(r):
			return 1 // punctuation
		}
		return 2 // word
	}

	// The bounds must be compared inclusively: many entries describe a single
	// code point (first == last), and the first and last code point of every
	// wider range belong to that range as well.
	for _, c := range classes {
		if c.first <= r && r <= c.last {
			return Class(c.value)
		}
	}
	for _, e := range emoji {
		if e.first <= r && r <= e.last {
			return 3 // emoji
		}
	}
	return 2 // word
}

// Split breaks s into consecutive tokens using SplitClass as the scanner's
// split function and returns the tokens in order. It returns nil when s is
// empty.
func Split(s string) []string {
	scan := bufio.NewScanner(strings.NewReader(s))
	// By default a Scanner stops with bufio.ErrTooLong once a single token
	// outgrows bufio.MaxScanTokenSize, and Split has no way to report that, so
	// a long run would silently truncate the result. Letting the buffer grow
	// to one byte more than the input guarantees any token fits.
	scan.Buffer(nil, len(s)+1)
	scan.Split(SplitClass)
	var words []string
	for scan.Scan() {
		words = append(words, scan.Text())
	}
	return words
}

// SplitClass is a split function for bufio.Scanner. Each token is the longest
// leading run of runes in data for which Is returns the same class.
//
// It returns (0, nil, nil) when data is empty at EOF. When atEOF is false it
// also returns (0, nil, nil), asking for more input, if whatever follows the
// run is not a complete rune; that includes the run reaching the end of data,
// since the next read could still extend it. Otherwise it returns the run's
// byte length as the advance and the run itself as the token. The error is
// always nil.
func SplitClass(data []byte, atEOF bool) (int, []byte, error) {
	if len(data) == 0 && atEOF {
		return 0, nil, nil
	}

	// n is the number of bytes in the run found so far; current is the class
	// of that run, or Invalid while no rune has been classified yet.
	n := 0
	current := Invalid
	for n < len(data) {
		r, size := utf8.DecodeRune(data[n:])
		class := Is(r)
		if current != Invalid && class != current {
			break
		}
		current = class
		n += size
	}

	// utf8.FullRune is false for an empty or truncated remainder, so without
	// EOF the decision is deferred until more bytes are available.
	if rest := data[n:]; !atEOF && !utf8.FullRune(rest) {
		return 0, nil, nil
	}
	return n, data[:n], nil
}
