utf8 包用到的常量定义如下:
// Numbers fundamental to the encoding.
const (
RuneError = '\uFFFD' // the "error" rune, also known as the Unicode replacement character (U+FFFD)
RuneSelf = 0x80 // characters below RuneSelf are represented as themselves in a single byte
MaxRune = '\U0010FFFF' // maximum valid Unicode code point
UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character
)
// Code points in the surrogate range [surrogateMin, surrogateMax] are not
// valid for UTF-8.
const (
surrogateMin = 0xD800 // lower bound of the surrogate range
surrogateMax = 0xDFFF // upper bound of the surrogate range
)
// Bit patterns and limits used by the UTF-8 routines of this package.
const (
	// Tag patterns: each value sets only the topmost 0, 1, 2, 3, 4 or 5
	// bits of a byte.
	t1 = 0b00000000
	tx = 0b10000000
	t2 = 0b11000000
	t3 = 0b11100000
	t4 = 0b11110000
	t5 = 0b11111000

	// Masks: each value keeps only the lowest 6, 5, 4 or 3 bits of a byte.
	maskx = 0b00111111
	mask2 = 0b00011111
	mask3 = 0b00001111
	mask4 = 0b00000111

	// Largest values that fit in 7, 11 and 16 bits respectively.
	rune1Max = 0x7F
	rune2Max = 0x7FF
	rune3Max = 0xFFFF
)
导出函数解读:
1、DecodeLastRune(p []byte)
a、 函数功能说明:返回p中最后一个rune和它占用的字节数。如果p为空,返回 \uFFFD(RuneError)和 0;如果p末尾的编码不合法,返回 \uFFFD(RuneError)和 1。
【其中\uFFFD是Unicode的“替换字符”:当从某种编码向Unicode转换时,如果源编码中的某个字符无法识别或在Unicode中没有对应字符,得到的将是Unicode码点“\uFFFD”(“\u”表示其后是Unicode码点的十六进制值)】
b、 函数源码:
// DecodeLastRune decodes the rune that ends at the last byte of p and
// returns it together with the number of bytes it occupies.
//
// Special results:
//   - p is empty: (RuneError, 0).
//   - the rune decoded from the chosen start position does not end exactly
//     at the end of p: (RuneError, 1).
func DecodeLastRune(p []byte) (r rune, size int) {
	n := len(p)
	if n == 0 {
		return RuneError, 0
	}

	// Fast path: a final byte below RuneSelf stands for itself.
	if last := rune(p[n-1]); last < RuneSelf {
		return last, 1
	}

	// Never look back further than UTFMax bytes. This guards against
	// O(n^2) behavior when traversing backwards through strings with
	// long sequences of invalid UTF-8.
	floor := n - UTFMax
	if floor < 0 {
		floor = 0
	}

	// Walk backwards from the second-to-last byte until RuneStart accepts
	// a byte or the floor is passed.
	head := n - 2
	for head >= floor && !RuneStart(p[head]) {
		head--
	}
	if head < 0 {
		head = 0
	}

	// Decode forward from the candidate start; the result only counts if
	// it consumes everything up to the end of p.
	r, size = DecodeRune(p[head:n])
	if head+size == n {
		return r, size
	}
	return RuneError, 1
}