Jump to content

Fun with mb strlen/code

From mediawiki.org
<?php

// Make UTF-8 the default encoding for mbstring functions, so that the
// one-argument mb_strlen() call made by the benchmark below counts
// UTF-8 characters rather than using some other default encoding.
mb_internal_encoding('UTF-8');

/**
 * Count the characters of a UTF-8 string using PCRE.
 *
 * Each character is captured individually by a UTF-8 aware pattern in
 * which the dot also matches newlines; the captures are then counted.
 *
 * @param string $str Text to measure; the /u modifier treats it as UTF-8.
 * @return int Number of single-character captures found in $str.
 */
function old_mb_strlen( $str ) {
	preg_match_all( '/(.)/us', $str, $captures );
	// Index 1 holds the contents of the first capture group,
	// one entry per matched character.
	$perCharacter = $captures[1];
	return count( $perCharacter );
}

/**
 * Count the characters of a UTF-8 string from its byte histogram.
 *
 * A character is counted once for each byte that starts it: either a
 * single ASCII byte (0x00-0x7f) or the head byte of a multibyte sequence
 * (0xc0-0xfe). Bytes in 0x80-0xbf and the byte 0xff are not counted.
 *
 * @param string $str Text to measure.
 * @return int Number of counted bytes, i.e. the character total.
 */
function new_mb_strlen( $str ) {
	// Mode 0 of count_chars(): byte value => number of occurrences.
	$histogram = count_chars( $str );
	$length = 0;

	foreach ( $histogram as $byte => $occurrences ) {
		$isAscii = $byte < 0x80;
		$isSequenceHead = $byte >= 0xc0 && $byte < 0xff;
		if ( $isAscii || $isSequenceHead ) {
			$length += $occurrences;
		}
	}

	return $length;
}

// Names of the functions to benchmark. Each one is invoked by name with the
// file contents as its only argument.
$benchme = array(
	'strlen',
	'mb_strlen',
	'old_mb_strlen',
	'new_mb_strlen' );

// Sample text files to load (relative paths).
$testfiles = array(
	'washington.txt',
	'berlin.txt',
	'bulgakov.txt',
	'tokyo.txt',
	'young.txt' );

// Number of timed calls per function and file; the reported time is the mean.
$rounds = 5;

foreach( $testfiles as $filename ) {
	$data = file_get_contents( $filename );
	if( $data === false ) {
		// file_get_contents() returns false when the file cannot be read.
		// Timing the functions on that value would report a meaningless
		// 0-character result, so skip the file instead.
		print "Skipping $filename: could not read file\n\n";
		continue;
	}
	print "Testing $filename:\n";
	foreach( $benchme as $function ) {
		$start = microtime( true );
		for( $i = 0; $i < $rounds; $i++ ) {
			// Only the result of the last round is kept for the report.
			$result = $function( $data );
		}
		// Mean wall-clock time per call, converted from seconds to milliseconds.
		$delta = ((microtime( true ) - $start) / $rounds) * 1000.0;
		printf( "%20s %10d chars %8.3fms\n", $function, $result, $delta );
	}
	print "\n";
}
?>