_cc = $utf_combining_class[$utf_char]; } $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char; } else { // Non-decomposable starter, check out if it's a Hangul syllable if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST) { // Nope, regular UTF char, check that we have the correct number of trailing bytes if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len]) { // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char // has been encoded in a five- or six- byte sequence. // Move the cursor back to its original position then advance it to the position it should really be at $pos -= $utf_len; $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } // Add a replacement char then another replacement char for every trailing byte. // // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos); $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1); $dump = $sort = 0; $pos += $spn; $tmp_pos = $pos; continue; } if (isset($extra_check[$utf_char[0]])) { switch ($utf_char[0]) { // Note: 0xED is quite common in Korean case "\xED": if ($utf_char >= "\xED\xA0\x80") { // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF) $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; // Note: 0xEF is quite common in Japanese case "\xEF": if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF") { // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF) $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; case "\xC0": case "\xC1": if ($utf_char <= "\xC1\xBF") { // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; case "\xE0": if ($utf_char <= "\xE0\x9F\xBF") { // Unicode char U+0000..U+07FF encoded in 3 bytes $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; case "\xF0": if ($utf_char <= "\xF0\x8F\xBF\xBF") { // Unicode char U+0000..U+FFFF encoded in 4 bytes $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; default: if ($utf_char > UTF8_MAX) { // Out of the Unicode range $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; } } } else { // Hangul syllabl