/**
 * Renders a DOM node and, recursively, its children as HTML markup with
 * lower-cased tag and attribute names.
 *
 * <p>An element is written as a start tag with its attributes (values trimmed
 * and HTML-escaped), followed by its children and an end tag. A childless
 * element whose lower-cased name is contained in {@code INLINE_CLOSED_TAG} is
 * written as {@code <name />}. A text node with a non-blank value is written
 * trimmed and passed through {@code escapeHTML}, and its children are not
 * visited. Every other node contributes only the output of its children.
 *
 * <p>With {@code check} enabled, the following elements are handled as text
 * nodes instead of elements, so their own tag is not written:
 * <ul>
 * <li>a {@code tr} whose parent is not a {@code table};</li>
 * <li>a {@code td} whose parent is not a {@code tr}, or whose grandparent is
 * not a {@code table};</li>
 * <li>an {@code a} that has an {@code href} attribute which does not match
 * {@code escapeSpecialHTMLPattern} (or when that pattern is {@code null});
 * according to the original comment only taobao-internal links are meant to
 * pass;</li>
 * <li>any element for which the checks above throw an exception.</li>
 * </ul>
 *
 * @param node
 *            the node to render
 * @param check
 *            whether the table/link restrictions described above are applied
 *
 * @return a new buffer holding the markup for {@code node}
 */
private static StringBuffer getHTML(Node node, boolean check) {
    short nodeType = node.getNodeType();
    String tag = StringUtil.defaultIfBlank(node.getNodeName());
    String text = node.getNodeValue();
    NamedNodeMap attributes = node.getAttributes();

    if (check && (nodeType == Node.ELEMENT_NODE)) {
        boolean rejected = false;
        try {
            if ("tr".equalsIgnoreCase(tag)) {
                // A row is only accepted directly inside a table.
                String parentName = node.getParentNode().getNodeName();
                rejected = !"table".equalsIgnoreCase(parentName);
            } else if ("td".equalsIgnoreCase(tag)) {
                // A cell is only accepted inside a row that sits directly in a table.
                Node row = node.getParentNode();
                rejected = !("tr".equalsIgnoreCase(row.getNodeName())
                        && "table".equalsIgnoreCase(row.getParentNode().getNodeName()));
            } else if ("a".equalsIgnoreCase(tag)) {
                // A link is only accepted when its href matches the configured pattern.
                Node hrefAttr = (attributes == null) ? null : attributes.getNamedItem("href");
                if (hrefAttr != null) {
                    String href = hrefAttr.getNodeValue();
                    rejected = (escapeSpecialHTMLPattern == null)
                            || !(new Perl5Matcher()).matches(href, escapeSpecialHTMLPattern);
                }
            }
        } catch (Exception ignored) {
            // Any failure while checking means the element is not trusted.
            rejected = true;
        }
        if (rejected) {
            nodeType = Node.TEXT_NODE;
        }
    }

    StringBuffer out = new StringBuffer();

    if (nodeType != Node.ELEMENT_NODE) {
        if ((nodeType == Node.TEXT_NODE) && StringUtil.isNotBlank(text)) {
            out.append(escapeHTML(text.trim()));
            return out;
        }
        // No markup of its own: only the children are rendered.
        for (Node kid = node.getFirstChild(); kid != null; kid = kid.getNextSibling()) {
            out.append(getHTML(kid, check));
        }
        return out;
    }

    String lowerTag = StringUtil.toLowerCase(tag);
    out.append("<").append(lowerTag);
    if (attributes != null) {
        int attributeCount = attributes.getLength();
        for (int index = 0; index < attributeCount; index++) {
            Node attribute = attributes.item(index);
            if ((attribute.getNodeName() == null) || (attribute.getNodeValue() == null)) {
                continue;
            }
            out.append(" ")
                    .append(StringUtil.toLowerCase(attribute.getNodeName()))
                    .append("=\"")
                    .append(StringEscapeUtil.escapeHtml(attribute.getNodeValue().trim()))
                    .append("\"");
        }
    }

    // If the element carries a value of its own, the start tag is closed right here.
    boolean valueWritten = StringUtil.isNotBlank(text);
    if (valueWritten) {
        out.append(">").append(escapeHTML(text.trim()));
    }

    Node kid = node.getFirstChild();
    if (kid == null) {
        if (!valueWritten) {
            if (INLINE_CLOSED_TAG.contains(lowerTag)) {
                out.append(" />");
            } else {
                out.append("></").append(lowerTag).append(">");
            }
        }
        return out;
    }

    if (!valueWritten) {
        out.append(">");
    }
    while (kid != null) {
        out.append(getHTML(kid, check));
        kid = kid.getNextSibling();
    }
    out.append("</").append(lowerTag).append(">");
    return out;
}
/**
 * Shortens a title for display.
 *
 * <p>A title that {@code StringUtil.isBlank} reports as blank yields an empty
 * string. A title of at most {@code maxSize} chars is returned trimmed. A
 * longer title is cut as soon as its accumulated display weight reaches
 * {@code maxSize}; if any characters had to be dropped, {@code "..."} is
 * appended. The long-title branch does not trim the input.
 *
 * <p>Display weights: a letter of Unicode category "other letter" (for example
 * a Chinese character) and any char that is neither letter, digit nor
 * whitespace weigh 1; every other letter or digit, and whitespace, weigh 0.5.
 * At least one char is always kept.
 *
 * @param t -
 *            title
 * @param maxSize -
 *            title max size
 *
 * @return - the title, shortened if necessary
 */
public static String getFormattedTitle(String t, int maxSize) {
    if (StringUtil.isBlank(t)) {
        return "";
    }

    final int length = t.length();
    if (length <= maxSize) {
        return t.trim();
    }

    double weight = 0;
    int end = 0; // exclusive end of the part of t that is kept
    do {
        char c = t.charAt(end++);
        if (Character.isLetterOrDigit(c)) {
            // chinese char weight 1, english char only weight 0.5
            weight += (Character.getType(c) == Character.OTHER_LETTER) ? 1 : 0.5;
        } else if (Character.isWhitespace(c)) {
            weight += 0.5; // white space char weight 0.5
        } else {
            weight += 1; // punctuation, symbols, ... weight 1
        }
    } while ((end < length) && (weight < maxSize));

    // Never cut between the two halves of a UTF-16 surrogate pair; the result
    // would be a malformed string.
    if (end < length) {
        char last = t.charAt(end - 1);
        char next = t.charAt(end);
        boolean lastIsHighSurrogate = (last >= 0xD800) && (last <= 0xDBFF);
        boolean nextIsLowSurrogate = (next >= 0xDC00) && (next <= 0xDFFF);
        if (lastIsHighSurrogate && nextIsLowSurrogate) {
            end++;
        }
    }

    String shortened = t.substring(0, end);
    // "end" already points at the first dropped char, so any remaining char
    // (even a single one) means the title was really truncated.
    return (end < length) ? (shortened + "...") : shortened;
}
// NOTE(review): the statements from here to the closing brace repeat the body
// of getHTML(Node, boolean) verbatim, but no method header precedes them in
// this part of the file. This looks like an accidental duplicate - confirm
// and remove.
short type = node.getNodeType();
String name = StringUtil.defaultIfBlank(node.getNodeName());
String value2 = node.getNodeValue();
NamedNodeMap attrs = node.getAttributes();
if (check) {
try { // a table row/cell that is not inside a table: ignore its tag
if ((type == Node.ELEMENT_NODE)
&& "tr".equalsIgnoreCase(name)
&& !"table".equalsIgnoreCase(node.getParentNode()
.getNodeName())) {
type = Node.TEXT_NODE;
} else if ((type == Node.ELEMENT_NODE)
&& "td".equalsIgnoreCase(name)
&& (!"tr".equalsIgnoreCase(node.getParentNode()
.getNodeName()) || !"table"
.equalsIgnoreCase(node.getParentNode()
.getParentNode().getNodeName()))) {
type = Node.TEXT_NODE;
} else if ((type == Node.ELEMENT_NODE)
&& "a".equalsIgnoreCase(name)) { // a link may only be a taobao-internal link
if (attrs != null) {
Node hrefNode = attrs.getNamedItem("href");
if (hrefNode != null) {
String href = hrefNode.getNodeValue();
// without a pattern, or when the href does not match it, the link tag is dropped
if ((escapeSpecialHTMLPattern == null)
|| !(new Perl5Matcher()).matches(href,
escapeSpecialHTMLPattern)) {
type = Node.TEXT_NODE;
}
}
}
}
} catch (Exception e) {
// any failure during the checks (presumably e.g. a missing parent node)
// also demotes the node to text
type = Node.TEXT_NODE;
}
}
StringBuffer sb = new StringBuffer();
// true once the start tag has been closed because the element's own value was written
boolean hasVal = false;
if (type == Node.ELEMENT_NODE) {
// start tag: lower-cased name followed by the attributes
sb.append("<");
sb.append(StringUtil.toLowerCase(name));
if (attrs != null) {
for (int i = 0; i < attrs.getLength(); i++) {
Node n = attrs.item(i);
// attributes lacking a name or a value are skipped
if ((n.getNodeName() != null) && (n.getNodeValue() != null)) {
sb.append(" ");
sb.append(StringUtil.toLowerCase(n.getNodeName()));
sb.append("=\"");
sb.append(StringEscapeUtil.escapeHtml(n.getNodeValue()
.trim()));
sb.append("\"");
}
}
}
if (StringUtil.isNotBlank(value2)) {
hasVal = true;
sb.append(">");
sb.append(escapeHTML(value2.trim()));
sb.append("");
}
} else if ((type == Node.TEXT_NODE) && StringUtil.isNotBlank(value2)) {
// non-blank text: write it trimmed and escaped; children are not visited
sb.append(escapeHTML(value2.trim()));
// sb.append("\n");
return sb;
}
Node child = node.getFirstChild();
if ((type == Node.ELEMENT_NODE) && (child == null)) {
// childless element: self-close if listed in INLINE_CLOSED_TAG, otherwise
// write an empty start/end tag pair
if (!hasVal
&& INLINE_CLOSED_TAG.contains(StringUtil.toLowerCase(name))) {
sb.append(" />");
} else if (!hasVal) {
sb.append("></");
sb.append(StringUtil.toLowerCase(name));
sb.append(">");
}
} else {
if (!hasVal && (type == Node.ELEMENT_NODE)) {
sb.append(">");
}
// render every child in document order
while (child != null) {
sb.append(getHTML(child, check));
child = child.getNextSibling();
}
if (type == Node.ELEMENT_NODE) {
sb.append("</");
sb.append(StringUtil.toLowerCase(name));
sb.append(">");
}
}
return sb;
}
/**
 * Shortens a title for display.
 *
 * <p>A title that {@code StringUtil.isBlank} reports as blank yields an empty
 * string. A title of at most {@code maxSize} chars is returned trimmed. A
 * longer title is cut as soon as its accumulated display weight reaches
 * {@code maxSize}; if any characters had to be dropped, {@code "..."} is
 * appended. The long-title branch does not trim the input.
 *
 * <p>Display weights: a letter of Unicode category "other letter" (for example
 * a Chinese character) and any char that is neither letter, digit nor
 * whitespace weigh 1; every other letter or digit, and whitespace, weigh 0.5.
 * At least one char is always kept.
 *
 * @param t -
 *            title
 * @param maxSize -
 *            title max size
 *
 * @return - the title, shortened if necessary
 */
public static String getFormattedTitle(String t, int maxSize) {
    if (StringUtil.isBlank(t)) {
        return "";
    }

    final int length = t.length();
    if (length <= maxSize) {
        return t.trim();
    }

    double weight = 0;
    int end = 0; // exclusive end of the part of t that is kept
    do {
        char c = t.charAt(end++);
        if (Character.isLetterOrDigit(c)) {
            // chinese char weight 1, english char only weight 0.5
            weight += (Character.getType(c) == Character.OTHER_LETTER) ? 1 : 0.5;
        } else if (Character.isWhitespace(c)) {
            weight += 0.5; // white space char weight 0.5
        } else {
            weight += 1; // punctuation, symbols, ... weight 1
        }
    } while ((end < length) && (weight < maxSize));

    // Never cut between the two halves of a UTF-16 surrogate pair; the result
    // would be a malformed string.
    if (end < length) {
        char last = t.charAt(end - 1);
        char next = t.charAt(end);
        boolean lastIsHighSurrogate = (last >= 0xD800) && (last <= 0xDBFF);
        boolean nextIsLowSurrogate = (next >= 0xDC00) && (next <= 0xDFFF);
        if (lastIsHighSurrogate && nextIsLowSurrogate) {
            end++;
        }
    }

    String shortened = t.substring(0, end);
    // "end" already points at the first dropped char, so any remaining char
    // (even a single one) means the title was really truncated.
    return (end < length) ? (shortened + "...") : shortened;
}