HTML过滤和补齐(二)

本文介绍了一个用于处理HTML节点的方法,该方法能够根据特定条件将DOM节点转换为HTML字符串,并实现对表格、链接等元素的特殊处理。此外,还提供了一种按长度和权重格式化标题的方法,确保展示效果的同时不丢失关键信息。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

/**
 * Recursively serializes a DOM node and its descendants into HTML text.
 *
 * <p>Element nodes are written as a lower-cased tag with their attributes
 * (attribute names lower-cased, values trimmed and passed through
 * {@code StringEscapeUtil.escapeHtml}); text nodes are written as their
 * trimmed value passed through {@code escapeHTML}. Nodes of any other type
 * emit nothing themselves, but their children are still visited.
 *
 * <p>When {@code check} is true, some elements are demoted to be treated as
 * text nodes, which drops their own tag while still serializing their
 * children:
 * <ul>
 * <li>a {@code tr} whose parent is not a {@code table};</li>
 * <li>a {@code td} whose parent is not a {@code tr}, or whose grandparent
 * is not a {@code table};</li>
 * <li>an {@code a} that has an {@code href} attribute which does not match
 * {@code escapeSpecialHTMLPattern} (or when that pattern is null);</li>
 * <li>any element for which the checks above throw an exception.</li>
 * </ul>
 *
 * @param node the node to serialize
 * @param check whether to apply the table/link filtering described above;
 * the same flag is passed down to every child
 *
 * @return a new buffer holding the serialized markup for this subtree
 */
private static StringBuffer getHTML(Node node, boolean check) {
short type = node.getNodeType();
String name = StringUtil.defaultIfBlank(node.getNodeName());
String value2 = node.getNodeValue();
NamedNodeMap attrs = node.getAttributes();

if (check) {
// Any exception thrown while inspecting ancestors or matching the href
// (presumably e.g. a missing parent node - TODO confirm) demotes the
// node to text in the catch block below.
try { // If this is a table element that is not inside a table, ignore it

if ((type == Node.ELEMENT_NODE)
&& "tr".equalsIgnoreCase(name)
&& !"table".equalsIgnoreCase(node.getParentNode()
.getNodeName())) {
type = Node.TEXT_NODE;
} else if ((type == Node.ELEMENT_NODE)
&& "td".equalsIgnoreCase(name)
&& (!"tr".equalsIgnoreCase(node.getParentNode()
.getNodeName()) || !"table"
.equalsIgnoreCase(node.getParentNode()
.getParentNode().getNodeName()))) {
type = Node.TEXT_NODE;
} else if ((type == Node.ELEMENT_NODE)
&& "a".equalsIgnoreCase(name)) { // If this is a link, it may only be a taobao-internal link.

// A link without attributes, or without an href, is left as an element.
if (attrs != null) {
Node hrefNode = attrs.getNamedItem("href");

if (hrefNode != null) {
String href = hrefNode.getNodeValue();

// No pattern configured, or href not matching it: drop the <a> tag.
if ((escapeSpecialHTMLPattern == null)
|| !(new Perl5Matcher()).matches(href,
escapeSpecialHTMLPattern)) {
type = Node.TEXT_NODE;
}
}
}
}
} catch (Exception e) {
type = Node.TEXT_NODE;
}
}

StringBuffer sb = new StringBuffer();
// Set when the element's own node value has been written right after the
// opening tag; in that case the ">" has already been emitted.
boolean hasVal = false;

if (type == Node.ELEMENT_NODE) {
// Opening tag: "<name" followed by every attribute that has both a name
// and a value.
sb.append("<");
sb.append(StringUtil.toLowerCase(name));

if (attrs != null) {
for (int i = 0; i < attrs.getLength(); i++) {
Node n = attrs.item(i);

if ((n.getNodeName() != null) && (n.getNodeValue() != null)) {
sb.append(" ");
sb.append(StringUtil.toLowerCase(n.getNodeName()));
sb.append("=\"");
sb.append(StringEscapeUtil.escapeHtml(n.getNodeValue()
.trim()));
sb.append("\"");
}
}
}

// NOTE(review): the org.w3c.dom.Node documentation defines nodeValue as
// null for Element nodes, so this branch may never run - confirm. If it
// does run and the element has no children, no closing tag is written
// (both child-less branches further down require !hasVal).
if (StringUtil.isNotBlank(value2)) {
hasVal = true;
sb.append(">");
sb.append(escapeHTML(value2.trim()));
sb.append("");
}
} else if ((type == Node.TEXT_NODE) && StringUtil.isNotBlank(value2)) {
// Text with content: emit the escaped value and stop; children are not visited.
sb.append(escapeHTML(value2.trim()));

// sb.append("\n");
return sb;
}

Node child = node.getFirstChild();

if ((type == Node.ELEMENT_NODE) && (child == null)) {
// Child-less element: self-close it when INLINE_CLOSED_TAG contains the
// lower-cased tag name, otherwise write an explicit empty pair.
if (!hasVal
&& INLINE_CLOSED_TAG.contains(StringUtil.toLowerCase(name))) {
sb.append(" />");
} else if (!hasVal) {
sb.append("></");
sb.append(StringUtil.toLowerCase(name));
sb.append(">");
}
} else {
// Finish the opening tag unless it was already closed together with the value.
if (!hasVal && (type == Node.ELEMENT_NODE)) {
sb.append(">");
}

// Serialize every child in document order with the same check flag.
while (child != null) {
sb.append(getHTML(child, check));
child = child.getNextSibling();
}

// Only real elements get a closing tag; demoted and non-element nodes
// contribute just their children's markup.
if (type == Node.ELEMENT_NODE) {
sb.append("</");
sb.append(StringUtil.toLowerCase(name));
sb.append(">");
}
}

return sb;
}

/**
 * Shortens a title for display using a weighted character budget.
 *
 * <p>A blank title (as decided by {@code StringUtil.isBlank}) yields the
 * empty string. A title whose {@code length()} does not exceed
 * {@code maxSize} is returned trimmed. Otherwise characters are copied from
 * the start of the (untrimmed) title until their accumulated weight reaches
 * {@code maxSize}; if at least one character is left uncopied, {@code "..."}
 * is appended to signal the truncation.
 *
 * <p>Weights: whitespace and letters/digits outside the Unicode category
 * {@link Character#OTHER_LETTER} count 0.5; letters in that category
 * (for example CJK ideographs) and all remaining characters such as
 * punctuation count 1. The first character is always copied, even when
 * {@code maxSize} is zero or negative.
 *
 * @param t the title; may be null
 * @param maxSize the maximum display weight of the returned title,
 *        not counting the appended {@code "..."}
 *
 * @return the formatted title, never null
 */
public static String getFormattedTitle(String t, int maxSize) {
    if (StringUtil.isBlank(t)) {
        return "";
    }

    if (t.length() <= maxSize) {
        return t.trim();
    }

    final int length = t.length();
    StringBuffer title = new StringBuffer(length);
    double weight = 0;
    int i = 0;

    // Copy characters until the weight budget is used up or the title ends.
    // The budget is tested after each append so that one character is
    // always kept, matching the historical behaviour.
    while (i < length) {
        char c = t.charAt(i);
        title.append(c);
        i++;

        boolean halfWidth = Character.isWhitespace(c)
                || (Character.isLetterOrDigit(c)
                        && (Character.getType(c) != Character.OTHER_LETTER));
        weight += halfWidth ? 0.5 : 1.0;

        if (weight >= maxSize) {
            break;
        }
    }

    // i now indexes the first character that was NOT copied. The previous
    // code advanced i once more before probing, so a single leftover
    // character was dropped without any ellipsis.
    if (i < length) {
        title.append("...");
    }

    return title.toString();
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值