import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts the search keyword from a search-engine result URL (for example a
 * referrer) and decodes it to readable text.
 *
 * <p>All members are static. The two regular expressions are kept as public,
 * non-final fields for backward compatibility; they are compiled on every call
 * to {@link #getKeyword(String, String)} so that a caller that reassigns them
 * still sees its change take effect.
 */
public class SeoURLKeyword {

    /**
     * Matches an http/https URL whose text contains one of the listed search
     * engine host suffixes followed (anywhere later) by that engine's keyword
     * parameter; capture group 1 is the raw, still-encoded parameter value
     * (everything up to the next {@code &}).
     */
    public static String keywordReg = "^(?:http|https)://.+(?:\\.baidu\\.com.*[&?](?:wd|word)="
            + "|\\.soso\\.com.*[&?]w="
            + "|\\.sogou\\.com.*[&?]query="
            + "|\\.bing\\.com.*[&?]q="
            + "|\\.youdao\\.com.*[&?]q="
            + "|\\.google\\.com.*[&?]q="
            + "|\\.360\\.cn.*[&?](?:kw|q)="
            + "|\\.360sou\\.com.*[&?](?:kw|q)="
            + "|\\.so\\.com.*[&?](?:kw|q)=)([^&]*)";

    /**
     * Matches a string whose characters, read as byte values 0x00-0xFF, are
     * laid out like UTF-8 sequences (1- to 6-byte forms). Used by
     * {@link #getKeyword(String, String)} to choose between utf-8 and gbk.
     */
    public static String encodeReg = "^(?:[\\x00-\\x7f]|[\\xfc-\\xff][\\x80-\\xbf]{5}|[\\xf8-\\xfb][\\x80-\\xbf]{4}|[\\xf0-\\xf7][\\x80-\\xbf]{3}|[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xc0-\\xdf][\\x80-\\xbf])+$";

    /**
     * Rewrites every {@code %} that does not start an accepted two-character
     * escape as {@code %25}, leaving accepted escapes untouched.
     *
     * <p>The text after each {@code %} (up to the next {@code %}) is one
     * segment. A segment shorter than two characters, or whose first two
     * characters are rejected by {@code Util.isShiLiu}, gets its {@code %}
     * replaced by {@code %25}.
     *
     * @param dataStr the text to sanitise; must not be {@code null}
     * @return {@code dataStr} itself when it contains no {@code %}, otherwise
     *         the sanitised copy
     */
    public static String parsePercent(String dataStr) {
        int firstPercent = dataStr.indexOf('%');
        if (firstPercent == -1) {
            return dataStr;
        }
        StringBuilder buffer = new StringBuilder(dataStr.length() + 8);
        buffer.append(dataStr, 0, firstPercent);
        // Limit -1 keeps trailing empty segments, so a trailing "%" or a run
        // such as "%%" yields exactly one "%25" per percent sign.
        String[] segments = dataStr.substring(firstPercent + 1).split("%", -1);
        for (int i = 0; i < segments.length; i++) {
            String segment = segments[i];
            // NOTE(review): Util.isShiLiu is defined outside this file; it is
            // presumably a "is hexadecimal" check - confirm.
            if (segment.length() >= 2 && Util.isShiLiu(segment.substring(0, 2))) {
                buffer.append("%");
            } else {
                buffer.append("%25");
            }
            buffer.append(segment);
        }
        return buffer.toString();
    }//end parsePercent

    /**
     * Returns the decoded search keyword contained in {@code url}.
     *
     * @param url       the URL to inspect; {@code null} yields {@code defaultKw}
     * @param defaultKw value returned when the URL does not match
     *                  {@link #keywordReg}, the keyword is empty, or decoding fails
     * @return the trimmed, decoded keyword, or {@code defaultKw}
     */
    public static String getKeyword(String url, String defaultKw) {
        if (url == null) {
            return defaultKw;
        }
        // The pattern is anchored with '^', so at most one match exists and
        // group 1 is the raw keyword.
        Matcher keywordMat = Pattern.compile(keywordReg).matcher(url);
        if (!keywordMat.find()) {
            return defaultKw;
        }
        String rawKeyword = keywordMat.group(1);
        if (rawKeyword.equals("")) {
            return defaultKw;
        }

        String keywordsTmp = parsePercent(filterPercent25(rawKeyword));

        // NOTE(review): ParseURLKeyword is defined outside this file; this class
        // has its own unescape(String) - confirm whether they are meant to differ.
        String unescapeString = ParseURLKeyword.unescape(keywordsTmp);
        boolean looksLikeUtf8 = Pattern.compile(encodeReg).matcher(unescapeString).matches();
        String encodeString = looksLikeUtf8 ? "utf-8" : "gbk";

        try {
            return decode(keywordsTmp, encodeString).trim();
        } catch (UnsupportedEncodingException e) {
            return defaultKw;
        } catch (IllegalArgumentException e) {
            // URLDecoder rejects malformed %-escapes; treat as "no keyword".
            return defaultKw;
        }
    }//end getKeyword

    /**
     * URL-decodes {@code kw} with the given charset. Input starting with
     * {@code %u} is first passed through {@link #unescape(String)}.
     *
     * @param kw     the encoded keyword; must not be {@code null}
     * @param encode charset name handed to {@link URLDecoder#decode(String, String)}
     * @return the decoded text
     * @throws UnsupportedEncodingException if {@code encode} is not supported
     */
    public static String decode(String kw, String encode) throws UnsupportedEncodingException {
        String encoded = kw;
        if (encoded.startsWith("%u")) {
            encoded = unescape(encoded);
        }
        return URLDecoder.decode(encoded, encode);
    }//end decode

    /**
     * Collapses repeated percent-encoding and strips plus signs.
     *
     * <p>Replaces every {@code %25} with {@code %}, repeating at most three
     * times while {@code %25} is still present, then removes every {@code +}
     * and trims the result.
     *
     * @param str the raw keyword; must not be {@code null}
     * @return the filtered, trimmed text
     */
    public static String filterPercent25(String str) {
        String result = str;
        // At most three passes: undoes up to triple encoding ("%252525").
        for (int pass = 0; pass < 3 && result.contains("%25"); pass++) {
            result = result.replace("%25", "%");
        }
        // NOTE(review): '+' is removed, not turned into a space - confirm intended.
        result = result.replace("+", "");
        return result.trim();
    }// end filterPercent25

    /**
     * Decodes {@code %XX} and {@code %uXXXX} escapes into the character with
     * that code; all other text is copied unchanged. A {@code %} that is not
     * followed by a complete run of hex digits is kept as a literal {@code %}
     * instead of raising an exception.
     *
     * @param src the escaped text; must not be {@code null}
     * @return the unescaped text
     */
    public static String unescape(String src) {
        int length = src.length();
        StringBuilder out = new StringBuilder(length);
        int cursor = 0;
        while (cursor < length) {
            int percentAt = src.indexOf('%', cursor);
            if (percentAt < 0) {
                out.append(src, cursor, length);
                break;
            }
            out.append(src, cursor, percentAt);

            boolean unicodeForm = percentAt + 1 < length && src.charAt(percentAt + 1) == 'u';
            int code;
            int escapeLength;
            if (unicodeForm) {
                code = parseHex(src, percentAt + 2, 4);
                escapeLength = 6;
            } else {
                code = parseHex(src, percentAt + 1, 2);
                escapeLength = 3;
            }

            if (code >= 0) {
                out.append((char) code);
                cursor = percentAt + escapeLength;
            } else {
                // Malformed or truncated escape: keep the '%' and move on.
                out.append('%');
                cursor = percentAt + 1;
            }
        }
        return out.toString();
    }//end unescape

    /**
     * Parses {@code count} hex digits of {@code src} starting at {@code start}.
     *
     * @return the parsed value, or -1 when the range runs past the end of
     *         {@code src} or contains a non-hex character
     */
    private static int parseHex(String src, int start, int count) {
        if (start + count > src.length()) {
            return -1;
        }
        int value = 0;
        for (int i = start; i < start + count; i++) {
            int digit = Character.digit(src.charAt(i), 16);
            if (digit < 0) {
                return -1;
            }
            value = (value << 4) | digit;
        }
        return value;
    }
}
1552

被折叠的 条评论
为什么被折叠?



