Dell Zhang在其博士论文《万维网信息聚类研究》中给出一个使用LSI进行正交聚类的例子。在前面的日志中我已有所提及。
>> A
A =
1 1 1 1 0 0 1 1 0 0 0 1
1 0 1 0 0 1 0 0 0 0 0 0
0 1 0 0 0 0 0 0 1 0 1 1
0 0 1 0 0 0 1 0 0 0 0 0
0 0 1 1 0 0 0 0 1 0 0 0
0 0 0 0 1 1 0 0 0 1 0 0
0 0 0 0 1 1 0 0 0 1 1 0
>> [U,S,V]=svd(A)
U =
-0.7584 0.2501 0.0553 0.4828 -0.1764 0.2900 0.1050
-0.3266 -0.1674 0.4447 -0.1296 0.8021 -0.0801 -0.0303
-0.3194 -0.0429 -0.8296 -0.0784 0.2736 -0.2522 -0.2514
-0.2630 0.0895 0.2877 -0.0901 -0.3509 -0.8277 -0.1538
-0.3334 0.0991 0.0137 -0.8558 -0.2282 0.2958 0.0832
-0.1088 -0.6087 0.1291 0.0517 -0.2371 0.2380 -0.6967
-0.1549 -0.7206 -0.1061 0.0298 -0.1385 -0.1289 0.6394
S =
Columns 1 through 11
3.2057 0 0 0 0 0 0 0 0 0 0
0 2.6127 0 0 0 0 0 0 0 0 0
0 0 1.9945 0 0 0 0 0 0 0 0
0 0 0 1.4900 0 0 0 0 0 0 0
0 0 0 0 1.1704 0 0 0 0 0 0
0 0 0 0 0 1.0192 0 0 0 0 0
0 0 0 0 0 0 0.5389 0 0 0 0
Column 12
0
0
0
0
0
0
0
V =
Columns 1 through 11
-0.3385 0.0316 0.2507 0.2371 0.5346 0.2059 0.1386 -0.2530 0.1037 0.3072 0.4856
-0.3362 0.0793 -0.3882 0.2714 0.0830 0.0370 -0.2717 -0.1987 -0.1965 -0.1145 -0.3338
-0.5245 0.1038 0.4018 -0.3978 0.0398 -0.3160 0.0077 0.1704 -0.4430 0.1557 -0.2071
-0.3406 0.1337 0.0346 -0.2504 -0.3457 0.5747 0.3493 -0.3641 0.1159 -0.2206 -0.1534
-0.0823 -0.5088 0.0115 0.0546 -0.3210 0.1070 -0.1064 -0.0244 -0.5135 -0.2517 0.5237
-0.1841 -0.5729 0.2345 -0.0323 0.3644 0.0285 -0.1626 0.0826 0.3393 -0.4629 -0.2784
-0.3186 0.1300 0.1720 0.2635 -0.4505 -0.5277 -0.0905 -0.1704 0.4430 -0.1557 0.2071
-0.2366 0.0957 0.0277 0.3240 -0.1507 0.2845 0.1949 0.8109 0.1075 -0.0216 0.0284
-0.2036 0.0215 -0.4091 -0.6270 0.0388 0.0428 -0.3121 0.1937 0.3271 0.0649 0.3605
-0.0823 -0.5088 0.0115 0.0546 -0.3210 0.1070 -0.1064 -0.0582 0.1742 0.7146 -0.2453
-0.1480 -0.2922 -0.4692 -0.0326 0.1154 -0.3739 0.7200 -0.0000 0.0000 -0.0000 -0.0000
-0.3362 0.0793 -0.3882 0.2714 0.0830 0.0370 -0.2717 0.0050 -0.1306 0.0495 -0.0268
Column 12
-0.1143
-0.6096
0.0240
0.1107
-0.0596
0.0903
-0.0240
-0.1311
-0.1347
-0.0307
-0.0000
0.7443
而只保留最大的两个奇异值及其对应的左、右奇异向量(即 x1 = U(:,1:2),y1 = V(:,1:2),ss 为 S(1:2,1:2) 保留两位小数后的近似值),则有:
>> x1
x1 =
-0.7584 0.2501
-0.3266 -0.1674
-0.3194 -0.0429
-0.2630 0.0895
-0.3334 0.0991
-0.1088 -0.6087
-0.1549 -0.7206
>> y1
y1 =
-0.3385 0.0316
-0.3362 0.0793
-0.5245 0.1038
-0.3406 0.1337
-0.0823 -0.5088
-0.1841 -0.5729
-0.3186 0.1300
-0.2366 0.0957
-0.2036 0.0215
-0.0823 -0.5088
-0.1480 -0.2922
-0.3362 0.0793
>> ss
ss =
3.2100 0
0 2.6100
>> (x1*ss*y1')*(x1*ss*y1')'
ans =
6.3529 2.2671 2.4230 2.2078 2.7743 -0.1868 -0.0172
2.2671 1.2901 1.1239 0.7830 1.0090 1.0604 1.3432
2.4230 1.1239 1.0638 0.8394 1.0683 0.5360 0.7205
2.2078 0.7830 0.8394 0.7673 0.9639 -0.0763 -0.0195
2.7743 1.0090 1.0683 0.9639 1.2123 -0.0371 0.0457
-0.1868 1.0604 0.5360 -0.0763 -0.0371 2.6462 3.1620
-0.0172 1.3432 0.7205 -0.0195 0.0457 3.1620 3.7849
此乃词语相似矩阵,但其中的负值(如 -0.1868、-0.0172 等)如何解释?
LSI是否物理上难以解释?
采用pLSI或者LDA能否解决这个问题?
----------------------
使用LDA可以避免负值问题(LDA中的各个分布都是非负的概率值),
并且LDA的话题空间数目的选择可以参考LSI的选择方式。