抓取百度新闻的时候出现乱码现象

原因:Golang 的字符串默认按 UTF-8 处理,标准库不支持 UTF-8 以外的字符集;而百度新闻页面使用 GBK 编码,直接输出就会出现乱码
解决:将字符串的编码转换成UTF-8
需要用到的库
go get github.com/axgle/mahonia
创建函数
# 常规方式 func ConvertToString(src string, srcCode string, tagCode string) string { srcCoder := mahonia.NewDecoder(srcCode) srcResult := srcCoder.ConvertString(src) tagCoder := mahonia.NewDecoder(tagCode) _, cdata, _ := tagCoder.Translate([]byte(srcResult), true) result := string(cdata) return result } # **这种方式更简洁** func decoderConvert(name string, body string) string { return mahonia.NewDecoder(name).ConvertString(body) }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# 常规方式
func
ConvertToString
(
src
string
,
srcCode
string
,
tagCode
string
)
string
{
srcCoder
:
=
mahonia
.
NewDecoder
(
srcCode
)
srcResult
:
=
srcCoder
.
ConvertString
(
src
)
tagCoder
:
=
mahonia
.
NewDecoder
(
tagCode
)
_
,
cdata
,
_
:
=
tagCoder
.
Translate
(
[
]
byte
(
srcResult
)
,
true
)
result
:
=
string
(
cdata
)
return
result
}
# **这种方式更简洁**
func
decoderConvert
(
name
string
,
body
string
)
string
{
return
mahonia
.
NewDecoder
(
name
)
.
ConvertString
(
body
)
}
|
使用函数
func main() { url:= "http://top.baidu.com/news?fr=topbuzz_b4_c2" html := GetHtml(url,"pc") //Println(html) html = ConvertToString(html, "gbk", "utf-8") Println(html) ExtractData(html) //Println(html) }
1
2
3
4
5
6
7
8
9
10
|
func
main
(
)
{
url
:
=
"http://top.baidu.com/news?fr=topbuzz_b4_c2"
html
:
=
GetHtml
(
url
,
"pc"
)
/
/
Println
(
html
)
html
=
ConvertToString
(
html
,
"gbk"
,
"utf-8"
)
Println
(
html
)
ExtractData
(
html
)
/
/
Println
(
html
)
}
|
效果如下
