'From etoys2.2 of 1 October 2007 [latest update: #1710] on 16 October 2007 at 10:34:59 pm'!
"Change Set:		utf8toSqueakFix-yo
Date:			16 October 2007
Author:			Yoshiki Ohshima

For obscure languages the character created should bear proper language tag."!


!ByteString methodsFor: 'converting' stamp: 'yo 10/16/2007 22:18'!
utf8ToSqueak
	"Convert the given string from UTF-8 using the fast path if converting to Latin-1.

	Answer the receiver itself when the initial scan with Latin1ToUtf8Map finds
	nothing to decode. Otherwise answer a new string in which every two-, three-
	or four-byte sequence has been replaced by the character answered by
	Unicode value: for the decoded code point, and all other bytes are copied
	through unchanged.

	Signal an error with the text 'Invalid UTF-8 input' when a byte located by
	the scan is not a valid two-, three- or four-byte lead byte, or when a
	continuation byte does not have the bit pattern 10xxxxxx."

	| outStream lastIndex nextIndex byte1 byte2 byte3 byte4 unicode |
	Latin1ToUtf8Map ifNil: [^super utf8ToSqueak]. "installation guard"
	lastIndex := 1.
	"Presumably the map flags every byte that needs decoding (the non-ASCII ones) - confirm against its initializer."
	nextIndex := ByteString findFirstInString: self inSet: Latin1ToUtf8Map startingAt: lastIndex.
	nextIndex = 0 ifTrue: [^self].
	outStream := (String new: self size) writeStream.
	["Copy the run of bytes that precedes the sequence found by the scan."
	outStream next: nextIndex - lastIndex putAll: self startingAt: lastIndex.
	byte1 := self byteAt: nextIndex.
	"Forget the code point of the previous sequence. Without this reset, a lead byte
	that matches none of the patterns below would pass the nil check at the end of
	the iteration and the previously decoded character would be emitted again."
	unicode := nil.
	(byte1 bitAnd: 16rE0) = 192 ifTrue: [ "two bytes: 110xxxxx 10xxxxxx"
		byte2 := self byteAt: (nextIndex := nextIndex + 1).
		(byte2 bitAnd: 16rC0) = 16r80 ifFalse: [self error: 'Invalid UTF-8 input'].
		unicode := ((byte1 bitAnd: 31) bitShift: 6) + (byte2 bitAnd: 63)].
	(byte1 bitAnd: 16rF0) = 224 ifTrue: [ "three bytes: 1110xxxx 10xxxxxx 10xxxxxx"
		byte2 := self byteAt: (nextIndex := nextIndex + 1).
		(byte2 bitAnd: 16rC0) = 16r80 ifFalse: [self error: 'Invalid UTF-8 input'].
		byte3 := self byteAt: (nextIndex := nextIndex + 1).
		(byte3 bitAnd: 16rC0) = 16r80 ifFalse: [self error: 'Invalid UTF-8 input'].
		unicode := ((byte1 bitAnd: 15) bitShift: 12) + ((byte2 bitAnd: 63) bitShift: 6)
			+ (byte3 bitAnd: 63)].
	(byte1 bitAnd: 16rF8) = 240 ifTrue: [ "four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx"
		byte2 := self byteAt: (nextIndex := nextIndex + 1).
		(byte2 bitAnd: 16rC0) = 16r80 ifFalse: [self error: 'Invalid UTF-8 input'].
		byte3 := self byteAt: (nextIndex := nextIndex + 1).
		(byte3 bitAnd: 16rC0) = 16r80 ifFalse: [self error: 'Invalid UTF-8 input'].
		byte4 := self byteAt: (nextIndex := nextIndex + 1).
		(byte4 bitAnd: 16rC0) = 16r80 ifFalse: [self error: 'Invalid UTF-8 input'].
		unicode := ((byte1 bitAnd: 16r7) bitShift: 18) +
					((byte2 bitAnd: 63) bitShift: 12) +
					((byte3 bitAnd: 63) bitShift: 6) +
					(byte4 bitAnd: 63)].
	"Still nil here means byte1 matched none of the lead-byte patterns above."
	unicode ifNil: [self error: 'Invalid UTF-8 input'].
	"Build the character through Unicode value: rather than from the bare code point;
	per the change set preamble this is what gives it the proper language tag."
	outStream nextPut: (Unicode value: unicode).
	lastIndex := nextIndex + 1.
	nextIndex := ByteString findFirstInString: self inSet: Latin1ToUtf8Map startingAt: lastIndex.
	nextIndex = 0] whileFalse.
	"Copy whatever follows the last decoded sequence."
	outStream next: self size - lastIndex + 1 putAll: self startingAt: lastIndex.
	^outStream contents
! !