- Valid ASCII
  RIGHT valid: 'a'
- Valid 2 Octet Sequence
  RIGHT valid: 'ñ'
- Invalid 2 Octet Sequence
  RIGHT not valid: '\195A'
- Invalid Sequence Identifier
  RIGHT not valid: '\160\161'
- Valid 3 Octet Sequence
  RIGHT valid: '₡'
- Invalid 3 Octet Sequence (in 2nd Octet)
  RIGHT not valid: '\226A\161'
- Invalid 3 Octet Sequence (in 3rd Octet)
  RIGHT not valid: '\226\130A'
- Valid 4 Octet Sequence
  RIGHT valid: '𐌼'
- Invalid 4 Octet Sequence (in 2nd Octet)
  RIGHT not valid: '\240A\140\188'
- Invalid 4 Octet Sequence (in 3rd Octet)
  RIGHT not valid: '\240\144A\188'
- Invalid 4 Octet Sequence (in 4th Octet)
  RIGHT not valid: '\240A\140A'
- Valid 5 Octet Sequence (but not Unicode!)
  RIGHT not valid: '\248\161\161\161\161'
- Valid 6 Octet Sequence (but not Unicode!)
  RIGHT not valid: '\252\161\161\161\161\161'
- Invalid is php null
  RIGHT not valid: NULL
- Invalid is not string
  RIGHT not valid: 0
- RIGHT valid: '0'
- RIGHT valid: ''
- RIGHT valid: ' '
- RIGHT valid: '0 '
- RIGHT valid: ' 0'
- RIGHT valid: '0 <script>alert();</script>'
- Invalid starts with a zero, but ends with invalid bytes
  RIGHT not valid: '0 \240A\140\188'
- RIGHT valid: 'this is plain ASCII'
- UTF8 Euro sign
  RIGHT valid: '€'
- RIGHT not valid: 'xxxx\254'
<?php
/**
 * Tell whether a value is a string holding well-formed UTF-8.
 *
 * Non-strings are always rejected, even when their string representation
 * would be valid. Falsy strings ('' and '0') are accepted without running
 * any regular expression.
 *
 * @param mixed $string Value to check.
 * @return bool True when $string is a string that passes both checks below.
 */
function is_valid_utf8($string) {
    // Only real strings qualify; null, ints, etc. are invalid by definition.
    if (!is_string($string)) {
        return false;
    }

    // The empty string (and '0') is trivially valid and must be accepted
    // before the pattern below, which requires at least one character.
    if (!$string) {
        return true;
    }

    // The /u modifier makes PCRE treat the subject as UTF-8, so the match
    // fails when the subject is not valid UTF-8.
    // See http://us.php.net/manual/en/reference.pcre.pattern.modifiers.php
    if (!preg_match('/^.{1}/usS', $string)) {
        return false;
    }

    // Byte-level check for octets that are never legal in UTF-8: 0xC0, 0xC1
    // and 0xF5-0xFF (this covers the lead bytes of five- and six-octet
    // sequences). See http://en.wikipedia.org/wiki/UTF-8 for why each is
    // illegal.
    return !preg_match("/[\xC0\xC1\xF5-\xFF]/S", $string);
}
// Test fixtures. Each entry is array(samples, expected):
//   - samples  is an array of description => subject string; entries written
//     without a key get the integer key 0, i.e. no description.
//   - expected is the boolean the validator should return for every subject
//     in that entry.
// Some samples from http://us.php.net/pcre.pattern.modifiers
// as given by hfuecks at nospam dot org
$inputs = array(
    array(array('Valid ASCII' => "a"), true),
    array(array('Valid 2 Octet Sequence' => "\xc3\xb1"), true),
    array(array('Invalid 2 Octet Sequence' => "\xc3\x41"), false),
    array(array('Invalid Sequence Identifier' => "\xa0\xa1"), false),
    array(array('Valid 3 Octet Sequence' => "\xe2\x82\xa1"), true),
    array(array('Invalid 3 Octet Sequence (in 2nd Octet)' => "\xe2\x41\xa1"), false),
    array(array('Invalid 3 Octet Sequence (in 3rd Octet)' => "\xe2\x82\x41"), false),
    array(array('Valid 4 Octet Sequence' => "\xf0\x90\x8c\xbc"), true),
    array(array('Invalid 4 Octet Sequence (in 2nd Octet)' => "\xf0\x41\x8c\xbc"), false),
    array(array('Invalid 4 Octet Sequence (in 3rd Octet)' => "\xf0\x90\x41\xbc"), false),
    // Only the final octet is broken here: the first three bytes are the
    // prefix of the valid four-octet sample above, so this case exercises
    // the fourth octet rather than repeating the second-octet failure.
    array(array('Invalid 4 Octet Sequence (in 4th Octet)' => "\xf0\x90\x8c\x41"), false),
    array(array('Valid 5 Octet Sequence (but not Unicode!)' => "\xf8\xa1\xa1\xa1\xa1"),
          false),
    array(array('Valid 6 Octet Sequence (but not Unicode!)' => "\xfc\xa1\xa1\xa1\xa1\xa1"),
          false),
    array(array('Invalid is php null' => null), false),
    array(array('Invalid is not string' => 0), false),
    // Falsy and whitespace-only strings must still count as valid.
    array(array('0'), true),
    array(array(''), true),
    array(array(' '), true),
    array(array('0 '), true),
    array(array(' 0'), true),
    array(array('0 <script>alert();</script>'), true),
    array(array('Invalid starts with a zero, but ends with invalid bytes' => '0 ' . "\xf0\x41\x8c\xbc"), false),
    array(array('this is plain ASCII'), true),
    array(array('UTF8 Euro sign' => chr(226).chr(130).chr(172)), true),
    array(array('xxxx' . chr(254)), false));
// Render the results as an HTML list: one <li> per sample, flagged with
// class="errors" when the validator's answer differs from the expectation.
echo '<ul>';
foreach ($inputs as $case) {
    list($samples, $wanted) = $case;

    foreach ($samples as $label => $sample) {
        $actual = is_valid_utf8($sample);
        $matches = ($actual == $wanted);

        echo $matches ? '<li>' : '<li class="errors">';

        // Samples declared without a key have the integer key 0, which is
        // falsy, so they are printed without a description line.
        if ($label) {
            echo $label, '<br />';
        }

        echo ($matches ? 'RIGHT' : 'WRONG'), ' ', ($actual ? '' : 'not '), 'valid', ': ';

        if ($actual) {
            // Reported as valid: print the text itself, HTML-escaped.
            echo "'", htmlspecialchars($sample), "'";
        } elseif (is_string($sample)) {
            // Rejected string: printed via the jm_dumpString() helper
            // instead of being echoed directly.
            echo "'" . jm_dumpString($sample) . "'";
        } else {
            // Non-string input (null, int, ...): show its PHP representation.
            var_export($sample);
        }

        echo '</li>';
    }
}
echo '</ul>';
jm_dump_php();
?>