#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2015-1-26
@author: beyondzhou
@name: nltk_extract_entity.py
'''
import json
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Download nltk packages used in this example
#nltk.download('stopwords')
# Read data
BLOG_DATA = r"E:\eclipse\Web\dFile\feed.json"


def pos_tag_content(content):
    """Return a flat list of (token, pos) tuples for the text *content*.

    The text is split into sentences, each sentence into words, and every
    sentence is POS-tagged on its own.
    """
    sentences = sent_tokenize(content)
    tokens = [word_tokenize(s) for s in sentences]
    pos_tagged_sentences = [nltk.pos_tag(t) for t in tokens]
    # Flatten the list since we're not using sentence structure;
    # sentence-final punctuation such as ('.', '.') is tagged as its own
    # token, so it still breaks noun runs in the flat list.
    return [token for sent in pos_tagged_sentences for token in sent]


def extract_entity_chunks(pos_tagged_tokens):
    """Join runs of adjacent tokens that share one noun tag into chunks.

    pos_tagged_tokens -- iterable of (token, pos) tuples.

    Returns a list of (chunk_text, pos) tuples in document order, where
    chunk_text is the run's tokens joined by single spaces and pos is the
    tag shared by those tokens. A tag counts as a noun tag when it starts
    with 'NN'. Duplicates are kept so callers can do frequency analysis.
    """
    all_entity_chunks = []
    current_entity_chunk = []
    chunk_pos = None
    for (token, pos) in pos_tagged_tokens:
        if current_entity_chunk and pos == chunk_pos:
            # Same noun tag as the open chunk: extend it.
            current_entity_chunk.append(token)
            continue
        if current_entity_chunk:
            # The run ended: emit it under its OWN tag, then start afresh
            # so one chunk never leaks into the next.
            all_entity_chunks.append((' '.join(current_entity_chunk),
                                      chunk_pos))
            current_entity_chunk = []
        if pos.startswith('NN'):
            # The first token of a run belongs to the chunk as well.
            current_entity_chunk = [token]
            chunk_pos = pos
    if current_entity_chunk:
        # Flush a chunk that runs up to the end of the input.
        all_entity_chunks.append((' '.join(current_entity_chunk), chunk_pos))
    return all_entity_chunks


def count_entities(entity_chunks):
    """Return a dict mapping each (chunk_text, pos) tuple to its count."""
    entities = {}
    for chunk in entity_chunks:
        entities[chunk] = entities.get(chunk, 0) + 1
    return entities


def main():
    """Index the entities of every post in BLOG_DATA and print a summary.

    Each post gets an 'entities' dict (see count_entities); the title is
    printed, underlined, followed by the title-cased entities and counts.
    """
    # Read data; the context manager closes the file even on a parse error.
    with open(BLOG_DATA) as feed_file:
        blog_data = json.load(feed_file)

    for post in blog_data:
        pos_tagged_tokens = pos_tag_content(post['content'])

        # Store the chunks as an index for the document
        # and account for frequency while we're at it...
        post['entities'] = count_entities(
            extract_entity_chunks(pos_tagged_tokens))

        # For example, we could display just the title-cased entities
        print(post['title'])
        print('-' * len(post['title']))
        for (entity, pos) in post['entities']:
            if entity.istitle():
                print('\t%s (%s)' % (entity, post['entities'][(entity, pos)]))
        # print('') emits a blank line on both Python 2 and Python 3.
        print('')


if __name__ == '__main__':
    main()
Four short links: 23 January 2015
---------------------------------
Designing on a system level
---------------------------
Goodman (2)
Bitcoin is just the first app to use blockchain technology
----------------------------------------------------------
Lantz (1)
Lantz Summit Francisco (2)
Lantz Summit (4)
Lantz (1)
Blockchain scalability
----------------------
Buterin (1)
Buterin James-Lubin Summit (4)
Buterin James-Lubin (1)
Buterin James-Lubin (1)
Buterin (1)
Buterin (1)
Buterin James-Lubin Summit Francisco (2)
Bringing an end to synthetic biology’s semantic debate
------------------------------------------------------------
Gardner Biology Working Group Scientific Committees (3)
Gardner (2)
Gardner (1)
Gardner Biology Working Group (1)
Building and deploying large-scale machine learning pipelines
-------------------------------------------------------------
Four short links: 22 January 2015
---------------------------------
How to make a UX designer
-------------------------
Wydeven (12)
Wydeven (8)
Wydeven (21)
The 3Ps of the blockchain: platforms, programs and protocols
------------------------------------------------------------
Four short links: 21 January 2015
---------------------------------
The Internet of Things is really about software
-----------------------------------------------
What containers can do for you
------------------------------
Four short links: 20 January 2015
---------------------------------
Webb Joining British Govt Data Service — (1)
Webb Joining British Govt Data Service — (8)
Striking parallels between mathematics and software engineering
---------------------------------------------------------------
Zheng (2)
Four short links: 19 January 2015
---------------------------------
Simpson ) — It (4)
Simpson ) — It (2)