互联网时代的社会语言学:基于SNS的文本数据挖掘

本文探讨了互联网时代社会语言学的研究方法,通过SNS文本数据挖掘,定义了凝固度与自由度等概念,旨在分析网络语言特征及其形成规律。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

互联网时代的社会语言学:基于SNS的文本数据挖掘
本文转载于[url]http://www.matrix67.com/blog/archives/5044[/url]

[b][i]几个概念[/i][/b]

[b]凝固度[/b]
我们定义“电影院”的凝合程度就是 p(电影院) 与 p(电) · p(影院) 的比值和 p(电影院) 与 p(电影) · p(院) 的比值中的较小值,“的电影”的凝合程度则是 p(的电影) 分别除以 p(的) · p(电影) 和 p(的电) · p(影) 所得的商的较小值。

[b]自由度[/b]
我们不妨就把一个文本片段的自由运用程度定义为它的左邻字信息熵和右邻字信息熵中的较小值。

Java 实现:处理 100M 以内的文本效果还可以,但文本超过 100M 以后内存会溢出。



public class FindWordsByWordArray {

private final static ResourceBundle resourceBundle = ResourceBundle.getBundle("finder");

private Map<String, Word> wordsMap = new HashMap<String, Word>();

private int wordMaxLen = 5;
private double allTextLen = 0;
private double allDomSize = 0;
private double mutualInformationPunish = 0.5;
private double leftAndRightEntropyPunish = 1;
private double wholePunish = 10;

/**
 * Creates a finder whose document count ({@code allDomSize}) starts at zero.
 * The count is increased by {@code parse(true, ...)}, which adds the number
 * of texts it is given.
 */
public FindWordsByWordArray() {
}

/**
 * Creates a finder with a preset document count.
 *
 * @param num initial value of {@code allDomSize}; the frequency and
 *            neighbour-count thresholds used when scoring a word are
 *            fractions of this value
 */
public FindWordsByWordArray(long num) {
this.allDomSize = num;
}

/**
 * Normalises every line of {@code input} and writes the resulting text
 * fragments to {@code output}, one fragment per line.
 *
 * <p>An existing {@code output} file is deleted first. Fragments are buffered
 * in memory and appended to the file whenever more than 500000 have
 * accumulated, and once more at the end for the remainder.
 *
 * @param input  UTF-8 text file to read
 * @param output file that receives the normalised fragments
 * @return the number of lines read from {@code input}
 * @throws IOException if reading or writing fails
 */
public static long pretreatment(File input, File output) throws IOException {

    if (output.exists()) {
        FileUtils.deleteQuietly(output);
    }

    List<String> res = new ArrayList<String>();
    long num = 0;

    LineIterator list = FileUtils.lineIterator(input, "utf-8");
    try {
        // hasNext() must be tested before next(): fetching first and testing
        // afterwards silently drops the final line and throws on an empty file.
        while (list.hasNext()) {
            String text = list.next();
            num++;

            res.addAll(pretreatment(text));

            if (res.size() > 500000) {
                FileUtils.writeLines(output, res, true);
                System.out.println("write lines 500000.");
                res.clear();
            }
        }
    } finally {
        // Release the underlying reader even when a read or write fails.
        list.close();
    }

    if (res.size() > 0) {
        FileUtils.writeLines(output, res, true);
    }

    System.out.println("pretreatment over.");
    return num;
}

/**
 * Normalises raw texts and cuts them into fragments.
 *
 * <p>Each text is lower-cased, every digit becomes {@code N}, and punctuation,
 * whitespace runs, HTML entities, URLs and a few extra symbols become the
 * separator {@code #}. The text is then split on {@code #}; fragments shorter
 * than five characters are discarded.
 *
 * @param texts raw input texts
 * @return the surviving fragments, in input order
 */
private static List<String> pretreatment(String... texts) {
    List<String> res = new ArrayList<String>();

    for (String text : texts) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default
        // locale (e.g. the Turkish dotless i).
        // The URL-scheme class is [a-zA-Z]; the former "A-z" range also
        // matched '^' and '`', which sit between 'Z' and 'a' in ASCII.
        // NOTE(review): the duplicated "~" alternative is harmless; it may
        // originally have been a different tilde variant - confirm.
        text = text.toLowerCase(java.util.Locale.ROOT).replaceAll("\\d", "N")
                .replaceAll("(\\p{P}|\\s+|&[a-zA-Z]*;|[a-zA-Z]+://[^\\s]*|~|~|★)", "#")
                .replace('.', '#')
                .replace('+', '#')
                .replace('|', '#')
                .replace('>', '#');

        for (String some : text.split("#")) {
            if (some.length() < 5) {
                continue;
            }
            res.add(some);
        }
    }
    return res;
}

/**
 * Feeds texts into the word statistics.
 *
 * <p>With {@code needPretreatment} set, the number of texts is added to the
 * document count, the texts are normalised into fragments, and the fragments
 * are parsed without further pretreatment. Otherwise each text is parsed
 * directly: a text made up solely of ASCII letters is counted as a single
 * English word and adds 1 to the total text length; any other text goes
 * through {@code parseChinese} and adds its character count.
 *
 * @param needPretreatment whether {@code texts} are raw and must be normalised first
 * @param texts            texts or already-normalised fragments
 */
public void parse(boolean needPretreatment, String... texts) {
    if (needPretreatment) {
        allDomSize += texts.length;
        parse(false, pretreatment(texts));
        return;
    }
    for (String text : texts) {
        // An empty string would match the letters-only pattern below and be
        // registered as a zero-length word, which addWord rejects with an
        // exception. It carries no information, so skip it.
        if (text.isEmpty()) {
            continue;
        }
        if (text.matches("^[a-zA-Z]*")) {
            parseEnglish(text);
            allTextLen += 1;
        } else {
            parseChinese(text);
            allTextLen += text.length();
        }
    }
}

/**
 * Registers a letters-only text as one English word.
 *
 * @param text the text to count
 */
private void parseEnglish(String text) {
    this.addEnglishWord(text);
}

/**
 * Collects candidate words from one fragment.
 *
 * <p>The fragment is viewed as a token sequence through {@code WordArray}.
 * For every start position, each span of 2 up to {@code wordMaxLen} tokens
 * is counted as a candidate; the token just before the span and the token
 * just after it are recorded as its left and right neighbours. Afterwards
 * the fragment's individual non-English characters and its English words are
 * counted as well.
 *
 * @param text the fragment to scan
 */
private void parseChinese(String text) {
    WordArray tokens = new WordArray(text);
    final int tokenCount = tokens.wordLen();
    final int maxSpan = wordMaxLen;
    String previousToken = null;

    for (int start = 0; start < tokenCount - 1; start++) {
        for (int end = start + 2; end <= start + maxSpan && end <= tokenCount; end++) {
            String candidate = tokens.subWords(start, end);
            addWord(candidate);
            Word entry = wordsMap.get(candidate);

            if (previousToken != null) {
                entry.leftAdd(previousToken);
            }
            // A right neighbour exists only when the span stops short of the end.
            if (end < tokenCount) {
                entry.rightAdd(tokens.subWords(end, end + 1));
            }
        }
        // The token at this position is the left neighbour of the next spans.
        previousToken = tokens.subWords(start, start + 1);
    }

    for (String single : tokens.getChineseWords()) {
        addWord(single);
    }
    for (String english : tokens.getEnglishWords()) {
        addEnglishWord(english);
    }
}

/**
 * Counts one occurrence of {@code word}: a new entry is created on first
 * sight, otherwise the existing entry's frequency is incremented.
 *
 * @param word the word to count
 * @throws IllegalArgumentException if {@code word} is empty
 */
private void addWord(String word) {
    if (word.isEmpty()) {
        throw new IllegalArgumentException("word length is 0.");
    }
    Word known = wordsMap.get(word);
    if (known == null) {
        wordsMap.put(word, new Word(word));
    } else {
        known.getTf().incrementAndGet();
    }
}

/**
 * Counts one occurrence of {@code word} and flags its entry as all-English.
 *
 * @param word the English word to count
 */
private void addEnglishWord(String word) {
    addWord(word);
    Word entry = wordsMap.get(word);
    entry.setAllEnglish(true);
}

/**
 * Collection variant of {@code parse(boolean, String...)}: copies the
 * collection into an array and delegates.
 *
 * @param needPretreatment whether {@code texts} are raw and must be normalised first
 * @param texts            texts or already-normalised fragments
 */
public void parse(boolean needPretreatment, Collection<String> texts) {
    String[] asArray = texts.toArray(new String[0]);
    parse(needPretreatment, asArray);
}

/**
 * Renders the accepted words (see {@code getRes()}) as tab-separated lines.
 *
 * @return one line per accepted word
 */
public List<String> print() {
    List<Word> accepted = getRes();
    return print(accepted);
}

/**
 * Renders the given words as tab-separated lines via {@code Word.toTab()}.
 *
 * @param words the words to render
 * @return one line per word, in the order given
 */
public List<String> print(List<Word> words) {
    List<String> lines = new ArrayList<String>();
    for (int i = 0, count = words.size(); i < count; i++) {
        lines.add(words.get(i).toTab());
    }
    return lines;
}

/**
 * Selects the words whose confidence level is greater than 1 and orders them
 * by descending frequency.
 *
 * @return a new list of the accepted words
 */
public List<Word> getRes() {
    List<Word> accepted = new ArrayList<Word>();
    for (Word candidate : wordsMap.values()) {
        if (candidate.getConfidenceLevel() > 1) {
            accepted.add(candidate);
        }
    }

    Comparator<Word> byFrequencyDescending = new Comparator<Word>() {
        @Override
        public int compare(Word first, Word second) {
            return second.tf.get() - first.tf.get();
        }
    };
    Collections.sort(accepted, byFrequencyDescending);
    return accepted;
}

/**
 * Statistics for one candidate word: its frequency, the distinct tokens seen
 * directly to its left and right, and a lazily computed confidence level.
 *
 * <p>This is an inner class because scoring reads the enclosing finder's
 * totals, tuning parameters and word map.
 */
class Word {

    private String word;
    private AtomicInteger tf;
    /** Concatenation of the left-neighbour tokens; stays {@code null} until one is added. */
    private StringBuilder left;
    /** Concatenation of the right-neighbour tokens; stays {@code null} until one is added. */
    private StringBuilder right;
    /** Cached confidence level; {@code null} until a non-rejected score has been computed. */
    private Double level = null;
    private boolean isAllEnglish = false;

    /**
     * Creates an entry with a frequency of 1.
     *
     * @param word the text of the word
     */
    Word(String word) {
        this.word = word;
        this.tf = new AtomicInteger(1);
    }

    /** @return the text of the word */
    public String getWord() {
        return word;
    }

    /** @return the mutable frequency counter */
    public AtomicInteger getTf() {
        return tf;
    }

    /**
     * Records a left neighbour unless the accumulated neighbour text already
     * contains it.
     *
     * @param str the token immediately left of an occurrence
     */
    public void leftAdd(String str) {
        // NOTE(review): containment is a substring test on the concatenated
        // text, so a multi-character token can be treated as already present
        // when it merely spans two earlier tokens - confirm this is acceptable.
        if (left == null) {
            this.left = new StringBuilder(3);
        }
        if (this.left.indexOf(str) < 0) {
            this.left.append(str);
        }
    }

    /** @return the token count of the accumulated left neighbours, 0 if none */
    public int getLeftNum() {
        if (left == null) {
            return 0;
        }
        return new WordArray(left.toString()).wordLen();
    }

    /**
     * Records a right neighbour unless the accumulated neighbour text already
     * contains it.
     *
     * @param str the token immediately right of an occurrence
     */
    public void rightAdd(String str) {
        if (right == null) {
            this.right = new StringBuilder(3);
        }
        if (this.right.indexOf(str) < 0) {
            this.right.append(str);
        }
    }

    /** @return the token count of the accumulated right neighbours, 0 if none */
    public int getRightNum() {
        if (right == null) {
            return 0;
        }
        return new WordArray(right.toString()).wordLen();
    }

    /** @param allEnglish whether this entry is an all-English word */
    public void setAllEnglish(boolean allEnglish) {
        isAllEnglish = allEnglish;
    }

    /**
     * Scores this word.
     *
     * <p>A word is rejected with 0 when it has at most one character besides
     * {@code N}, or when its frequency or neighbour counts fall below
     * fractions of the document count. For other non-English words the score
     * combines the smallest ratio of observed to expected frequency over all
     * two-way splits with the smaller of the two neighbour counts, each
     * raised to its tuning exponent, divided by {@code wholePunish}.
     * All-English words are scored from frequency alone.
     *
     * @return the confidence level; rejected words yield 0 and are not cached
     */
    private Double getConfidenceLevel() {
        if (this.level != null) {
            return this.level;
        }

        double allDomSize = FindWordsByWordArray.this.allDomSize;

        if (this.getWord().replaceAll("N", "").length() <= 1) {
            return 0d;
        }
        int ownTf = this.getTf().get();
        if (ownTf < allDomSize / 90) {
            return 0d;
        }

        double value;
        if (!this.isAllEnglish) {
            // Each count re-tokenises the neighbour text, so compute it once.
            int leftNum = this.getLeftNum();
            if (leftNum < allDomSize / 190) {
                return 0d;
            }
            int rightNum = this.getRightNum();
            if (rightNum < allDomSize / 190) {
                return 0d;
            }
            if ((rightNum + leftNum) < allDomSize / 90) {
                return 0d;
            }
            value = Double.MAX_VALUE;

            WordArray wordArray = new WordArray(this.getWord());

            for (int i = 1, len = wordArray.wordLen(); i < len; i++) {
                int leftTf = wordsMap.get(wordArray.subWords(0, i)).getTf().get();
                int rightTf = wordsMap.get(wordArray.subWords(i)).getTf().get();

                // Multiply as double: the product of two int frequencies
                // exceeds the int range once both parts occur more than about
                // 46000 times, which corrupted the expected frequency.
                double normal = (double) leftTf * rightTf / (allTextLen * allTextLen);
                double reality = ownTf * 2.0 / allTextLen;

                double ratio = reality / normal;
                if (ratio < value) {
                    value = ratio;
                }
            }

            int size = Math.min(leftNum, rightNum);

            value = Math.pow(value, mutualInformationPunish)
                    * Math.pow(size, leftAndRightEntropyPunish)
                    / wholePunish;
        } else {
            value = ownTf * 15.0 / allDomSize;
        }
        this.level = value;
        return value;
    }

    /**
     * Debug representation with truncated neighbour text. An entry without
     * recorded neighbours is shown with an empty neighbour string.
     */
    @Override
    public String toString() {
        // left and right are created lazily, so either may still be null here.
        String leftText = left == null ? "" : left.toString();
        String rightText = right == null ? "" : right.toString();
        return "Word{" +
                "word='" + word + '\'' +
                ", tf=" + tf +
                ", left=" + cutOff(leftText, 15) +
                ", right=" + cutOff(rightText, 15) +
                '}';
    }

    /**
     * @return word, frequency, cached confidence level, left count and right
     *         count, separated by tabs
     */
    public String toTab() {
        return word + '\t' +
                tf + '\t' +
                level + '\t' +
                getLeftNum() + '\t' +
                getRightNum();
    }

    /**
     * Truncates {@code str} to {@code max} characters plus a marker and
     * prefixes the token count of the resulting text in parentheses.
     */
    private String cutOff(String str, int max) {
        if (str.length() > max) {
            str = str.substring(0, max) + "...]";
        }
        return "(" + new WordArray(str).wordLen() + ")" + str;
    }
}

/**
 * Sets the maximum number of tokens a candidate word may span when
 * fragments are scanned.
 *
 * @param wordMaxLen upper bound on candidate length, in tokens
 */
public void setWordMaxLen(int wordMaxLen) {
this.wordMaxLen = wordMaxLen;
}

/**
 * Sets the exponent applied to a word's smallest observed-to-expected
 * frequency ratio when its confidence level is computed.
 *
 * @param mutualInformationPunish the exponent
 */
public void setMutualInformationPunish(double mutualInformationPunish) {
this.mutualInformationPunish = mutualInformationPunish;
}

/**
 * Sets the exponent applied to the smaller of a word's left and right
 * neighbour counts when its confidence level is computed.
 *
 * @param leftAndRightEntropyPunish the exponent
 */
public void setLeftAndRightEntropyPunish(double leftAndRightEntropyPunish) {
this.leftAndRightEntropyPunish = leftAndRightEntropyPunish;
}

/**
 * Sets the divisor applied to the combined score of a non-English word when
 * its confidence level is computed.
 *
 * @param wholePunish the divisor
 */
public void setWholePunish(double wholePunish) {
this.wholePunish = wholePunish;
}

/**
 * Command-line entry point with hard-coded paths.
 *
 * <p>If the input path is a regular file, it is pretreated into an
 * intermediate file, every fragment is parsed, and all accepted words are
 * written to the output path. Otherwise the input path is treated as a
 * directory: each file in it is processed on its own with a fresh finder,
 * the first tab-separated column of every line is dropped, and at most 500
 * words per file are written into the output directory.
 *
 * @param args unused
 * @throws IOException if reading or writing fails
 */
public static void main(String[] args) throws IOException {

    String inputPath = "e:/xiaoshuo.txt";
    String outputPath = "e:/xiaoshuo_words";

    // Alternative paths for the directory mode:
    // String inputPath = "e:/tweet/parse";
    // String outputPath = "e:/tweet/words";

    File inputFile = new File(inputPath);

    if (inputFile.isFile()) {
        File pretreatFile = new File("e:/xiaoshuo_p");
        long domSize = pretreatment(inputFile, pretreatFile);
        System.out.println(domSize);
        FindWordsByWordArray findWords = getFindWords(domSize);
        LineIterator list = FileUtils.lineIterator(pretreatFile, "utf-8");
        try {
            int i = 0;
            // Test hasNext() before next() so the final fragment is parsed too
            // and an empty file does not throw.
            while (list.hasNext()) {
                findWords.parse(false, list.next());
                if (i++ % 500000 == 0) {
                    System.out.print(".");
                }
            }
        } finally {
            list.close();
        }
        FileUtils.writeLines(new File(outputPath), findWords.print());
    } else {
        String[] inputFileNames = inputFile.list();
        if (inputFileNames == null) {
            // File.list() returns null when the path is not a readable directory.
            System.err.println("input path is neither a file nor a readable directory: " + inputPath);
            return;
        }
        for (String inputFileName : inputFileNames) {
            FindWordsByWordArray findWords = getFindWords();
            List<String> list = FileUtils.readLines(new File(inputPath, inputFileName), "utf-8");
            findWords.parse(true, Lists.transform(list, new Function<String, String>() {
                @Override
                public String apply(String s) {
                    return s.substring(s.split("\t")[0].length());
                }
            }));
            // "name.ext" becomes "name-words.ext"; any other name gets "-words." appended.
            String[] nameParts = inputFileName.split("\\.");
            String outputFileName = inputFileName + "-words.";
            if (nameParts.length == 2) {
                outputFileName = nameParts[0] + "-words." + nameParts[1];
            }
            List<String> printList = findWords.print();
            if (printList.size() > 500) {
                printList = printList.subList(0, 500);
            }
            FileUtils.writeLines(new File(outputPath, outputFileName), printList);
        }
    }
}

/**
 * Builds a configured finder whose document count starts at zero.
 *
 * @return a finder configured from the {@code finder} resource bundle
 */
private static FindWordsByWordArray getFindWords() {
    final long initialDocumentCount = 0;
    return getFindWords(initialDocumentCount);
}

/**
 * Builds a finder with the given document count and applies the four tuning
 * values read from the {@code finder} resource bundle.
 *
 * @param num initial document count
 * @return the configured finder
 */
private static FindWordsByWordArray getFindWords(long num) {
    int maxLen = Integer.parseInt(resourceBundle.getString("word.max.len"));
    double mutualInformation =
            Double.parseDouble(resourceBundle.getString("mutual.information.punish"));
    double leftAndRightEntropy =
            Double.parseDouble(resourceBundle.getString("left.and.right.entropy.punish"));
    double whole = Double.parseDouble(resourceBundle.getString("whole.punish"));

    FindWordsByWordArray configured = new FindWordsByWordArray(num);
    configured.setWordMaxLen(maxLen);
    configured.setMutualInformationPunish(mutualInformation);
    configured.setLeftAndRightEntropyPunish(leftAndRightEntropy);
    configured.setWholePunish(whole);
    return configured;
}



/**
 * Presents a string as a sequence of tokens in which every maximal run of
 * characters accepted by {@code CharUtils.isEnglish} counts as one token and
 * every other character counts as one token of its own.
 */
public class WordArray {

    private String someWord;
    /** One {@code {startCharIndex, length}} pair per English run; {@code null} when there is none. */
    private List<int[]> enIndexAndLen = null;

    /**
     * Scans {@code someWord} once and records the position and length of each
     * English run.
     *
     * @param someWord the text to tokenise
     */
    public WordArray(String someWord) {
        this.someWord = someWord;
        final int total = someWord.length();
        int pos = 0;
        while (pos < total) {
            if (!CharUtils.isEnglish(someWord.charAt(pos))) {
                pos++;
                continue;
            }
            int runStart = pos;
            do {
                pos++;
            } while (pos < total && CharUtils.isEnglish(someWord.charAt(pos)));

            if (enIndexAndLen == null) {
                enIndexAndLen = new ArrayList<int[]>();
            }
            enIndexAndLen.add(new int[]{runStart, pos - runStart});
            // The character that ended the run is known to be non-English.
            pos++;
        }
    }

    /**
     * Returns the text covered by the tokens in {@code [beginIndex, endIndex)}.
     *
     * @param beginIndex first token index, inclusive
     * @param endIndex   last token index, exclusive
     * @return the corresponding substring of the original text
     */
    public String subWords(int beginIndex, int endIndex) {
        return someWord.substring(toCharOffset(beginIndex), toCharOffset(endIndex));
    }

    /**
     * Returns the text from token {@code beginIndex} to the end.
     *
     * @param beginIndex first token index, inclusive
     * @return the corresponding substring of the original text
     */
    public String subWords(int beginIndex) {
        return subWords(beginIndex, wordLen());
    }

    /**
     * @return the number of tokens: the character count with every English
     *         run collapsed to a single unit
     */
    public int wordLen() {
        int tokens = someWord.length();
        if (enIndexAndLen == null) {
            return tokens;
        }
        for (int[] run : enIndexAndLen) {
            tokens -= run[1] - 1;
        }
        return tokens;
    }

    /** @return the English runs in order of appearance; an empty array if there are none */
    public String[] getEnglishWords() {
        if (enIndexAndLen == null) {
            return new String[0];
        }
        String[] found = new String[enIndexAndLen.size()];
        for (int k = 0; k < found.length; k++) {
            int[] run = enIndexAndLen.get(k);
            found[k] = someWord.substring(run[0], run[0] + run[1]);
        }
        return found;
    }

    /** @return every character not accepted by {@code CharUtils.isEnglish}, each as its own string */
    public List<String> getChineseWords() {
        List<String> singles = new ArrayList<String>();
        for (int k = 0, total = someWord.length(); k < total; k++) {
            char c = someWord.charAt(k);
            if (!CharUtils.isEnglish(c)) {
                singles.add(String.valueOf(c));
            }
        }
        return singles;
    }

    /** Small manual demonstration of the token view. */
    public static void main(String[] args) {
        WordArray sample = new WordArray("我爱Style江南的music哈");
        int tokenCount = sample.wordLen();
        System.out.println(sample.subWords(0, 5));
        System.out.println(sample.subWords(5, tokenCount));
        System.out.println(tokenCount);
        System.out.println(Arrays.toString(sample.getEnglishWords()));
    }

    /**
     * Converts a token index into a character offset: each English run that
     * starts before the running offset widens it by the run's extra characters.
     */
    private int toCharOffset(int wordIndex) {
        int offset = wordIndex;
        if (enIndexAndLen == null) {
            return offset;
        }
        for (int[] run : enIndexAndLen) {
            if (run[0] < offset) {
                offset += run[1] - 1;
            }
        }
        return offset;
    }

}




#(int)[2-n default=5] word max len.
word.max.len = 5

#(double)[1.0-0.0 default=0.5] MutualInformation punish.
mutual.information.punish = 0.5

#(double)[1.0-0.0 default=1.0] LeftAndRightEntropy punish.
left.and.right.entropy.punish = 1

#(double)[1-n default=10] WholePunish punish.
whole.punish = 10
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值