I’m reading a JSON file with Python. The file looks like this:
[
  {
    "id": 1,
    "qas": [
      {
        "image_id": 1,
        "qa_id": 10000000,
        "answer": "グレー",
        "a_objects": [],
        "question": "右側の男性は何色の服を着ていますか?",
        "q_objects": []
      },
      {
        "image_id": 1,
        "qa_id": 10000001,
        "answer": "赤色",
        "a_objects": [],
        "question": "左側の男性は何色の服を着ていますか?",
        "q_objects": []
      },
      ...
#! /usr/bin/env python
# coding=utf-8
import json
import sys

infile = "/home/c-nrong/VQA/draw/Json/question_answers_jan.json"
jfile = json.load(open(infile, 'r'))
#names_train = jfile["unique_img_train"]
newd = {}
for i in range(len(jfile)):
    imgid = jfile[i]["id"]
    print imgid
    for j in range(len(jfile[i]["qas"])):
        qid = jfile[i]["qas"][j]["qa_id"]
        ques = jfile[i]["qas"][j]["question"]
        print ques
The output looks like
u'\u53f3\u5074\u306e\u7537\u6027\u306f\u4f55\u8272\u306e\u670d\u3092\u7740\u3066\u3044\u307e\u3059\u304b\uff1f'
Add
#! /usr/bin/env python
# coding=utf-8
to the beginning of the file, and the output becomes
右側の男性は何色の服を着ていますか?
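Alternatively, the unicode objects returned by json.load can be encoded explicitly before printing; this also avoids UnicodeEncodeError when stdout is redirected to a file. A minimal sketch, assuming the same JSON layout and path as above:
#! /usr/bin/env python
# coding=utf-8
import json

# Encode each unicode question to UTF-8 bytes before printing,
# so the Japanese text prints correctly even when stdout is not a terminal.
infile = "/home/c-nrong/VQA/draw/Json/question_answers_jan.json"
jfile = json.load(open(infile, 'r'))

for entry in jfile:
    for qa in entry["qas"]:
        print qa["question"].encode("utf-8")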
Test code for tokenizing the Japanese questions with MeCab:
#! /usr/bin/env python
# coding=utf-8
import MeCab
import sys

def tokenize(sentence):
    #print(sentence, type(sentence))  # unicode: (u'\u6a2a\uff1f')
    #s = sentence.encode("utf-8")
    #print(s, type(s))  # str: ('\xe6\x9f')
    s = sentence
    reslist = []
    mt = MeCab.Tagger("mecabrc")
    res = mt.parseToNode(s)
    res = res.next            # skip the BOS (beginning-of-sentence) node
    while res:
        #print(res.surface)
        reslist.append(res.surface)
        res = res.next
    del reslist[-1]           # drop the empty surface of the trailing EOS node
    #print(reslist[1], type(reslist[1]))  # str: ('\xe6\x9f')
    #for x in reslist: print x.decode("utf-8")
    reslist_utf8 = [x.decode("utf-8") for x in reslist]
    return reslist

if __name__ == "__main__":
    labels = ["dare", "doko", "dona", "dorekurai", "dou", "ikutu", "itu", "naze"]
    path = "quesID"
    for i, label in enumerate(labels):
        filei = "%s/%s.quesID" % (path, label)
        f = open(filei, 'r')
        f_lines = f.readlines()
        f.close()
        for ix, line in enumerate(f_lines):
            sen = line.strip('\n').split(':')[2]
            print(sen)
            res = tokenize(sen)
        sys.exit()
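For reference, a small usage sketch of the tokenize function on one of the questions from the JSON file above. It assumes the script is saved under the hypothetical name tokenize_mecab.py; the exact segmentation depends on the MeCab dictionary installed:
#! /usr/bin/env python
# coding=utf-8
# Assumes the tokenizer script above is saved as tokenize_mecab.py (hypothetical name).
from tokenize_mecab import tokenize

sen = "右側の男性は何色の服を着ていますか?"   # UTF-8 byte string literal in Python 2
for tok in tokenize(sen):
    print tok    # each token is a UTF-8 encoded str; segmentation depends on the dictionary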