# Unicode helpers: scalar-value checks and conversions between UTF-8 strings,
# UTF-16 code unit arrays (short[]) and UTF-32 code point arrays (int[]).
class Unicode : precompile {
  use IntList;
  # StringBuffer is used by utf16_to_utf8, so load it explicitly like IntList.
  use StringBuffer;

  # Native method; its implementation is not part of this file.
  # As used by utf8_to_utf16: it is called repeatedly with a reference to the
  # current offset, a return value >= 0 is treated as a decoded code point and
  # a negative return value ends the iteration.
  # NOTE(review): presumably the offset is advanced past the decoded character
  # on success - confirm against the native implementation.
  native static method uchar : int ($str : string, $offset_ref : int*);

  # Native method; its implementation is not part of this file.
  # As used by utf16_to_utf8: takes one code point and returns a string that is
  # appended to the UTF-8 output.
  native static method uchar_to_utf8 : string ($uchar : int);

  enum {
    # NOTE(review): presumably the value uchar returns for malformed UTF-8;
    # nothing in this file reads it - confirm against the native implementation.
    ERROR_INVALID_UTF8 = -2,
  }

  # Returns 1 if $code_point is a Unicode scalar value, otherwise 0.
  # A scalar value is any code point in 0..0x10FFFF except the surrogate
  # range 0xD800..0xDFFF.
  static method is_unicode_scalar_value : int ($code_point: int) {
    my $is_unicode_scalar_value = 0;

    # The range of Unicode code points
    if ($code_point >= 0 && $code_point <= 0x10FFFF) {
      # Not surrogate code points
      unless ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
        $is_unicode_scalar_value = 1;
      }
    }

    return $is_unicode_scalar_value;
  }

  # Converts a UTF-8 string to an array of UTF-16 code units.
  # Dies with "Can't convert part of string to utf16" if decoding stops before
  # the end of $str is reached.
  static method utf8_to_utf16 : short[] ($str : string) {
    my $pos = 0;
    my $uchars_list = IntList->new_len(0);

    # Collect code points until uchar reports a negative value
    while ((my $uchar = &uchar($str, \$pos)) >= 0) {
      $uchars_list->push($uchar);
    }

    # The loop must have consumed the whole string; otherwise decoding stopped early
    unless ($pos >= length $str) {
      die "Can't convert part of string to utf16";
    }

    my $uchars = $uchars_list->to_array;
    my $utf16_chars = &utf32_to_utf16($uchars);

    return $utf16_chars;
  }

  # Converts an array of UTF-16 code units to a UTF-8 string.
  # Dies with "Invalid UTF-16 string" (raised by utf16_to_utf32) if the input
  # contains an unpaired surrogate.
  static method utf16_to_utf8 : string ($utf16_chars : short[]) {
    my $uchars = &utf16_to_utf32($utf16_chars);

    # Encode each code point and concatenate the results
    my $buffer = StringBuffer->new;
    for (my $i = 0; $i < @$uchars; $i++) {
      my $uchar = $uchars->[$i];
      my $utf8_str = &uchar_to_utf8($uchar);
      $buffer->push($utf8_str);
    }

    my $str = $buffer->to_string;

    return $str;
  }

  # Converts an array of code points (UTF-32) to an array of UTF-16 code units.
  # Code points below 0x10000 become one unit; all others become a
  # high/low surrogate pair.
  # Dies with "Invalid code point in code point string" if an element is not a
  # Unicode scalar value (negative, above 0x10FFFF, or a surrogate code point).
  static method utf32_to_utf16 : short[] ($code_point_string : int[]) {
    my $length = 0;

    # Calculate length
    for (my $i = 0; $i < @$code_point_string; $i++) {
      my $code_point = $code_point_string->[$i];

      # Surrogate code points are rejected too: copying them through would
      # produce lone surrogates, which is not valid UTF-16
      unless (&is_unicode_scalar_value($code_point)) {
        die "Invalid code point in code point string";
      }

      if ($code_point < 0x10000) {
        $length++;
      }
      else {
        $length += 2;
      }
    }

    # Convert code point to UTF-16
    my $utf16_string = new short[$length];
    my $pos = 0;
    for (my $i = 0; $i < @$code_point_string; $i++) {
      my $code_point = $code_point_string->[$i];
      if ($code_point < 0x10000) {
        $utf16_string->[$pos] = (short)$code_point;
        $pos++;
      }
      else {
        # Split the 20-bit offset from 0x10000 into its upper and lower 10 bits
        $utf16_string->[$pos] = (short)(($code_point - 0x10000) / 0x400 + 0xD800);
        $utf16_string->[$pos + 1] = (short)(($code_point - 0x10000) % 0x400 + 0xDC00);
        $pos += 2;
      }
    }

    return $utf16_string;
  }

  # Converts an array of UTF-16 code units to an array of code points (UTF-32).
  # Each unit is masked with 0xFFFF so that it is read as an unsigned 16-bit value.
  # Dies with "Invalid UTF-16 string" if a high surrogate is not immediately
  # followed by a low surrogate, or if a low surrogate appears on its own.
  static method utf16_to_utf32 : int[] ($utf16_string : short[]) {
    my $length = 0;

    # Calculate length and validate surrogate pairing
    for (my $i = 0; $i < @$utf16_string; $i++) {
      my $unit = $utf16_string->[$i] & 0xFFFF;

      if (&_is_utf16_high_surrogate($unit)) {
        # A high surrogate in the last position has no partner
        if ($i + 1 >= @$utf16_string) {
          die "Invalid UTF-16 string";
        }

        # Consume the second unit of the pair
        $i++;
        unless (&_is_utf16_low_surrogate($utf16_string->[$i] & 0xFFFF)) {
          die "Invalid UTF-16 string";
        }
      }
      elsif (&_is_utf16_low_surrogate($unit)) {
        die "Invalid UTF-16 string";
      }

      $length++;
    }

    # Convert UTF-16 to code point; the input is known to be well formed here
    my $code_point_string = new int[$length];
    my $pos = 0;
    for (my $i = 0; $i < @$utf16_string; $i++) {
      if (&_is_utf16_high_surrogate($utf16_string->[$i] & 0xFFFF)) {
        $code_point_string->[$pos] = 0x10000 + (($utf16_string->[$i] & 0xFFFF) - 0xD800) * 0x400 + (($utf16_string->[$i + 1] & 0xFFFF) - 0xDC00);
        $i++;
      }
      else {
        $code_point_string->[$pos] = $utf16_string->[$i] & 0xFFFF;
      }
      $pos++;
    }

    return $code_point_string;
  }

  # Returns 1 if $ch is in the high surrogate range 0xD800..0xDBFF, otherwise 0.
  private static method _is_utf16_high_surrogate : int ($ch : int) {
    if ($ch >= 0xD800 && $ch < 0xDC00) {
      return 1;
    }
    else {
      return 0;
    }
  }

  # Returns 1 if $ch is in the low surrogate range 0xDC00..0xDFFF, otherwise 0.
  private static method _is_utf16_low_surrogate : int ($ch : int) {
    if ($ch >= 0xDC00 && $ch < 0xE000) {
      return 1;
    }
    else {
      return 0;
    }
  }
}