Zipf定律大致可以表述为:如果将一篇文献中的单词按照出现频率从大到小进行排序构成频率表,则一个单词出现的频率与其在该频率表中的排名成反比。该定律已经广泛应用于语言学、经济学、地理学以及信息科学等领域,并在多种语言的词频统计中得到验证。
1.源代码如下:
import java.io.Serializable;
import java.util.NavigableMap;
import java.util.Random;
import java.util.TreeMap;
/**
 * Builds a cumulative-probability table for a Zipf distribution and draws
 * ranks from it.
 *
 * <p>The unnormalized weight of rank {@code i} (1-based) is
 * {@code Constant / i^skew}. The weights are normalized so they sum to one and
 * stored as a running total: each key of {@link #map} is the upper bound of a
 * rank's interval and the value is the zero-based rank. Low ranks therefore
 * own wider intervals than high ranks whenever {@code skew > 0}.
 */
public class Zifp_gen implements Serializable {
    private static final long serialVersionUID = 1L;

    /** Numerator of the unnormalized weight {@code Constant / rank^skew}. */
    private static final double Constant = 1.0;

    /** Seeded with 0, so the sequence returned by {@link #next()} is repeatable. */
    private final Random random = new Random(0);

    /** Cumulative probability (interval upper bound) mapped to the zero-based rank. */
    NavigableMap<Double, Integer> map;

    /**
     * Creates the lookup table.
     *
     * @param nums     number of ranks; a value below 1 yields an empty table
     * @param skewness 0 gives a uniform distribution, larger values concentrate
     *                 more probability on the first ranks
     * @throws IllegalArgumentException if {@code skewness} is NaN or infinite
     */
    public Zifp_gen(int nums, double skewness) {
        if (Double.isNaN(skewness) || Double.isInfinite(skewness)) {
            throw new IllegalArgumentException("skewness must be finite, got " + skewness);
        }
        map = computeMap(nums, skewness);
    }

    /**
     * Computes the cumulative table.
     *
     * @param size number of ranks
     * @param skew degree of skew; 0 means no skew, larger means more skew
     * @return map from cumulative probability to zero-based rank
     */
    private static NavigableMap<Double, Integer> computeMap(
            int size, double skew) {
        NavigableMap<Double, Integer> map =
                new TreeMap<Double, Integer>();

        // Normalization constant: the sum of all unnormalized weights.
        double div = 0;
        for (int i = 1; i <= size; i++) {
            div += (Constant / Math.pow(i, skew));
        }

        double sum = 0;
        for (int i = 1; i <= size; i++) {
            double p = (Constant / Math.pow(i, skew)) / div;
            sum += p;

            // Rounding can leave the running total slightly off 1.0. Clamp it,
            // and pin the final bound to exactly 1.0 so that every draw in
            // [0, 1) has a ceiling entry.
            double key = (i == size) ? 1.0 : Math.min(sum, 1.0);

            // When a weight is too small to move the running total, the key
            // repeats. Keep the earlier rank: it owns the interval, and
            // overwriting it would hand its probability to a negligible rank.
            if (!map.containsKey(key)) {
                map.put(key, i - 1);
            }
        }
        return map;
    }

    /**
     * Draws the next rank.
     *
     * @return a rank in {@code [1, n]}, where {@code n} is the number of ranks
     * @throws IllegalStateException if the table is empty
     */
    public int next() {
        if (map.isEmpty()) {
            throw new IllegalStateException("no ranks to sample from");
        }
        double value = random.nextDouble();
        // The entry with the smallest upper bound >= value owns the draw.
        return map.ceilingEntry(value).getValue() + 1;
    }
}
2.单元测试
import java.util.NavigableMap;
/** Console driver that dumps the lookup table of a {@code Zifp_gen}. */
public class Test {
    /**
     * Builds a table for 100 ranks with skew 1.0 and prints one line per
     * entry, in ascending key order.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final Zifp_gen generator = new Zifp_gen(100, 1.0);
        for (NavigableMap.Entry<Double, Integer> bucket : generator.map.entrySet()) {
            final Double cumulative = bucket.getKey();
            final Integer rank = bucket.getValue();
            StringBuilder line = new StringBuilder("Key = ");
            line.append(cumulative).append(", Value = ").append(rank);
            System.out.println(line.toString());
        }
    }
}
3.写出结果到文件中:
/** File driver that writes the cumulative keys of a {@code Zifp_gen} table. */
public class Test {
    /** Output file used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "F:\\zipf_100_0.5.txt";

    /**
     * Builds a table for 100 ranks with skew 0.5 and writes each key on its
     * own line, in ascending order.
     *
     * @param args optional; {@code args[0]} overrides the default output path
     * @throws IOException if the file cannot be opened or a write fails
     */
    public static void main(String args[]) throws IOException {
        String target = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT;
        Zifp_gen z1 = new Zifp_gen(100, 0.5);
        PrintWriter pw = new PrintWriter(new FileWriter(target));
        try {
            for (NavigableMap.Entry<Double, Integer> entry : z1.map.entrySet()) {
                // The trailing space is kept so the file format stays unchanged.
                String str = entry.getKey() + " ";
                pw.println(str);
            }
            // PrintWriter never throws on write; ask it whether anything failed.
            if (pw.checkError()) {
                throw new IOException("Failed to write " + target);
            }
        } finally {
            // Close the file even if iteration or the error check throws.
            pw.close();
        }
    }
}