<?php

/**
 * Report whether a value is a string holding well-formed UTF-8.
 *
 * Non-string values are always rejected, even when their string
 * representation would be valid.
 *
 * Background reading:
 *  - http://us.php.net/manual/en/reference.pcre.pattern.modifiers.php
 *    (the /u modifier makes preg_match refuse a malformed subject)
 *  - http://en.wikipedia.org/wiki/UTF-8 (bytes that may never appear)
 *
 * @param mixed $string Value to check.
 * @return bool True for a string that passes both checks, false otherwise.
 */
function is_valid_utf8($string) {
    if (!is_string($string)) {
        return false;
    }

    // The only strings PHP treats as falsy are '' and '0'; both are valid,
    // and the empty string would not satisfy the one-character match below.
    if ($string === '' || $string === '0') {
        return true;
    }

    // The /u modifier trick: the match yields a falsy result when the
    // subject is not valid UTF-8.
    if (!preg_match('/^.{1}/usS', $string)) {
        return false;
    }

    // Belt and braces: preg_match may theoretically let five or six octet
    // sequences through, but they are not displayable in any browser.  The
    // bytes C0, C1 and F5-FF are illegal anywhere in a UTF-8 stream (see the
    // Wikipedia page for the reason behind each of them).
    return !preg_match("/[\xC0\xC1\xF5-\xFF]/S", $string);
}
// Self-test for is_valid_utf8(): runs a table of sample inputs through the
// function and prints an HTML list saying whether each result was expected.
//
// Some samples come from http://us.php.net/pcre.pattern.modifiers
// as given by hfuecks at nospam dot org.
//
// Each entry is array(array(description => input), expected result).  An
// entry written without a description gets the implicit integer key 0, which
// the report loop treats as "no description" because 0 is falsy.
$inputs = array(
    array(array('Valid ASCII' => "a"), true),
    array(array('Valid 2 Octet Sequence' => "\xc3\xb1"), true),
    array(array('Invalid 2 Octet Sequence' => "\xc3\x41"), false),
    array(array('Invalid Sequence Identifier' => "\xa0\xa1"), false),
    array(array('Valid 3 Octet Sequence' => "\xe2\x82\xa1"), true),
    array(array('Invalid 3 Octet Sequence (in 2nd Octet)' => "\xe2\x41\xa1"), false),
    array(array('Invalid 3 Octet Sequence (in 3rd Octet)' => "\xe2\x82\x41"), false),

    array(array('Valid 4 Octet Sequence' => "\xf0\x90\x8c\xbc"), true),
    array(array('Invalid 4 Octet Sequence (in 2nd Octet)' => "\xf0\x41\x8c\xbc"), false),
    array(array('Invalid 4 Octet Sequence (in 3rd Octet)' => "\xf0\x90\x41\xbc"), false),
    // Only the last octet is malformed here, so this case really exercises
    // the 4th position (the first three bytes match the valid sample above).
    array(array('Invalid 4 Octet Sequence (in 4th Octet)' => "\xf0\x90\x8c\x41"), false),
    array(array('Valid 5 Octet Sequence (but not Unicode!)' => "\xf8\xa1\xa1\xa1\xa1"),
          false),
    array(array('Valid 6 Octet Sequence (but not Unicode!)' => "\xfc\xa1\xa1\xa1\xa1\xa1"),
          false),
    array(array('Invalid is php null' => null), false),
    array(array('Invalid is not string' => 0), false),
    array(array('0'), true),
    array(array(''), true),
    array(array(' '), true),
    array(array('0 '), true),
    array(array(' 0'), true),
    array(array('0 <script>alert();</script>'), true),
    array(array('Invalid starts with a zero, but ends with invalid bytes' => '0 ' . "\xf0\x41\x8c\xbc"), false),
    array(array('this is plain ASCII'), true),
    array(array('UTF8 Euro sign' => chr(226).chr(130).chr(172)), true),
    array(array('xxxx' . chr(254)), false),
);

echo '<ul>';
foreach ($inputs as $input) {
    $strings = $input[0];
    $expected = $input[1];
    foreach ($strings as $description => $string) {
        $got = is_valid_utf8($string);
        // Both sides are booleans, so a strict comparison is safe; compute
        // it once and reuse it for the CSS class and the verdict text.
        $correct = ($got === $expected);

        echo $correct ? '<li>' : '<li class="errors">';
        if ($description) {
            echo $description;
            echo '<br />';
        }
        echo $correct ? 'RIGHT' : 'WRONG';
        echo ' ';
        echo ($got) ? '' : 'not ';
        echo 'valid';
        echo ': ';
        if ($got) {
            // Valid strings are echoed directly, HTML-escaped.
            echo "'";
            echo htmlspecialchars($string);
            echo "'";
        } else {
            if (is_string($string)) {
                // NOTE(review): jm_dumpString() is not defined in this file;
                // presumably it comes from an include or auto-prepend and
                // renders the raw bytes safely -- confirm.
                echo "'" . jm_dumpString($string) . "'";
            } else {
                var_export($string);
            }
        }
        echo '</li>';
    }
}
echo '</ul>';
// NOTE(review): jm_dump_php() is also defined outside this file -- confirm
// it is available wherever this script runs.
jm_dump_php();

?>