unit utf8;

interface

uses windows,sysutils;

function AnsiStringToUtf8(const AnsiString:string):string;
function Utf8StringToAnsi(const Utf8String:string):string;

function ConvertUTF16toUTF8(sourceStart:pwidechar;sourceEnd:pwidechar;
	targetStart:pchar;targetEnd:pchar):integer;

function ConvertUTF8toUTF16(sourceStart:pchar;sourceEnd:pchar;
	targetStart:pwidechar;targetEnd:pwidechar;sourceFindPtr:pchar;var targetFoundPtr:pwidechar):integer;

implementation

const offsetsFromUTF8:array[0..5] of longint = ($00000000, $00003080,
         $000E2080, $03C82080, $FA082080, $82082080);
const bytesFromUTF8:array[char] of byte = (
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5);

const firstByteMark:array[0..6] of byte = ($00, $00, $C0, $E0, $F0, $F8, $FC);

const conv_sourceExhausted=-1;

const kReplacementCharacter=#$FFFD;
      kReplacementCharacterL=$FFFD;
      kMaximumUCS2 =$0000FFFF;
      kMaximumUTF16 =$0010FFFF;
      kMaximumUCS4 =$7FFFFFFF;
      halfBase=$0010000;
      halfShift=10;
      kSurrogateHighStart	= $D800;
      kSurrogateHighEnd	= $DBFF;
      halfMask=$3FF;
      kSurrogateLowStart=$DC00;
      kSurrogateLowEnd=$DFFF;

const byteMask = $BF;
      byteMark = $80;

function ConvertUTF16toUTF8(sourceStart:pwidechar;sourceEnd:pwidechar;
	targetStart:pchar;targetEnd:pchar):integer;
var source:pwidechar;
    bytesToWrite:integer;
    target:pchar;
    lch,lch2:longint;
label l1,l2,l3,l4,l5;
begin
	source:=sourceStart;
	target:=targetStart;
	while (source < sourceEnd) do begin
	  lch := word(source[0]);
      inc(source);
      if (lch >= kSurrogateHighStart) and (lch <= kSurrogateHighEnd) and
        (source < sourceEnd) then begin
        lch2 := word(source[0]);
        if (lch2 >= kSurrogateLowStart) and (lch2 <= kSurrogateLowEnd) then begin
          lch := ((lch - kSurrogateHighStart) shl halfShift)
                + (lch2 - kSurrogateLowStart) + halfBase;
          inc(source);
        end;
      end;
      if lch<0 then begin
        lch := kReplacementCharacterL;
        bytesToWrite := 2;
      end else if (lch < $80) then				bytesToWrite := 1
		else if (lch < $800) then		bytesToWrite := 2
		else if (lch < $10000) then		bytesToWrite := 3
		else if (lch < $200000) then		bytesToWrite := 4
		else if (lch < $4000000) then	bytesToWrite := 5
		else bytesToWrite := 6;
		inc(target,bytesToWrite);
		if (target > targetEnd) then begin
			dec(target,bytesToWrite);
      break;
    end;
(*		switch (bytesToWrite) {	/* note: code falls through cases! */
			case 6:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
			case 5:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
			case 4:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
			case 3:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
			case 2:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
			case 1:	*--target =  ch | firstByteMark[bytesToWrite];
		};*)
    if bytestowrite<6 then goto l5;
    dec(target);
    target[0]:=char((lch or byteMark) and byteMask);
    lch:=lch shr 6;
l5: if bytestowrite<5 then goto l4;
    dec(target);
    target[0]:=char((lch or byteMark) and byteMask);
    lch:=lch shr 6;
l4: if bytestowrite<4 then goto l3;
    dec(target);
    target[0]:=char((lch or byteMark) and byteMask);
    lch:=lch shr 6;
l3: if bytestowrite<3 then goto l2;
    dec(target);
    target[0]:=char((lch or byteMark) and byteMask);
    lch:=lch shr 6;
l2: if bytestowrite<2 then goto l1;
    dec(target);
    target[0]:=char((lch or byteMark) and byteMask);
    lch:=lch shr 6;
l1: dec(target);
    target[0]:=char(lch or firstByteMark[bytesToWrite]);
    inc(target,bytesToWrite);
  end;
  result:=target-targetstart;
end;

function GetUtf8CharWidth(firstchar:char):integer;
begin
  result:=bytesFromUTF8[firstchar]+1;
end;

function IsFirstUTF8Char(thechar:char):boolean;
{The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.}
begin
  result:=(byte(thechar) and (128+64))<>128;
end;

function ConvertUTF8toUTF16(sourceStart:pchar;sourceEnd:pchar;
	targetStart:pwidechar;targetEnd:pwidechar;sourceFindPtr:pchar;var targetFoundPtr:pwidechar):integer;
var source:pchar;
    extrabytestowrite:integer;
    target:pwidechar;
    lch:longint;
begin
 source := sourceStart;
 target := targetStart;
 result := 0;
 while (source < sourceEnd) do begin
  	lch:=0;
	extraBytesToWrite:=bytesFromUTF8[source[0]];
    if (source + extraBytesToWrite > sourceEnd) then begin
    	result:=conv_sourceExhausted;
      break;
    end;
    if sourceFindPtr<>nil then  {Find a pointer in the target corresponding to one in the source}
      if source>=sourceFindPtr then begin
        targetFoundPtr:=target;
        sourceFindPtr:=nil;
      end;
    asm
      push edi
      mov edi,source
      xor eax,eax
      mov edx,extrabytestowrite
      mov al,[edi]
      or edx,edx
      jz @@done     {Spezialfall: 0 Zusatzbytes}
      xor ecx,ecx
    @@again:
      inc edi
      shl eax,6
      mov cl,[edi]
      add eax,ecx
      dec edx
      jnz @@again
    @@done:
      pop edi
      mov lch,eax
    end;
    inc(source,extrabytestowrite+1);
    dec(lch,offsetsFromUTF8[extraBytesToWrite]);

    if (target >= targetEnd) then begin
     	//result := conv_targetExhausted;
      break;
    end;
    if (lch <= kMaximumUCS2) and (lch>=0) then begin
      word(target[0]) := lch;
      inc(target);
	end else if (lch > kMaximumUTF16) or (lch<0) then begin
	  target[0] := kReplacementCharacter;
      inc(target);
    end else begin
	  if (target + 1 >= targetEnd) then begin
//      	result := targetExhausted;
        break;
      end;
      dec(lch,halfBase);
      target[0]:= Widechar((lch shr halfShift) + kSurrogateHighStart);
	  target[1]:= Widechar((lch and halfMask) + kSurrogateLowStart);
      inc(target,2);
    end;
  end;
  if result<>conv_sourceExhausted then
    result:=target-targetstart;
end;

function wstrlen(wp:pwidechar):integer;
begin
  if wp=nil then result:=0
  else begin
    result:=0;
    while wp[result]<>#0 do
      inc(result);
  end;
end;

function AnsiStringToUtf8(const AnsiString:string):string;
var StringW:array[0..1023] of widechar;
    utf8str:array[0..2047] of char;
begin
  MultiBytetoWideChar(CP_ACP, 0, pchar(AnsiString), -1, StringW, sizeof(StringW) div 2-1);
  ConvertUTF16toUTF8(StringW,StringW+wstrlen(StringW)+1,
    utf8str,utf8str+sizeof(utf8str)-1);
  result:=strpas(utf8str);
end;

function Utf8StringToAnsi(const Utf8String:string):string;
var StringW:array[0..1023] of widechar;
    ansistr:array[0..2047] of char;
    unusedptr:pwidechar;
begin
  ConvertUTF8toUTF16(pchar(Utf8String),pchar(Utf8String)+length(Utf8String)+1,
    StringW,StringW+sizeof(StringW) div 2-1,nil,unusedptr);
  WideCharToMultiByte(CP_ACP,0,StringW,-1,
    ansistr,sizeof(ansistr)-1,nil,nil);
  result:=strpas(ansistr);
end;


end.
