【java工具类】网站安全---将特殊字符编码成为html实体

最新推荐文章于 2023-04-24 17:31:23 发布

原创最新推荐文章于 2023-04-24 17:31:23 发布 · 1.8k 阅读

2 ·

CC 4.0 BY-SA版权

本文介绍了一种用于防御跨站脚本(XSS)攻击的工具类，该工具类能够对用户输入进行有效的实体编码和解码，确保网页内容的安全显示。

上两篇文章已经提到 JavaScript 的 XSS 攻击问题。本文针对的是下面引用的“普通文本”场景：

普通文本

假如我要直接在div里面显示用户的输入的信息，譬如：

[html]view plaincopy 
   
 <div class='userName'><%=userName%></div>  

这种情况，即使用户的userName是：

[html]view plaincopy 
   
 <script>alert("ok");</script>  

我也必须可以显示出来，不能过滤这些字符。

针对上述情况，我们可以根据 HTML 字符实体规范（http://www.w3school.com.cn/html/html_entities.asp）设定白名单：白名单内的字符不转换，白名单以外的字符一律转换为实体。下面就是小弟为了实现这个功能而写的工具类：

【HtmlEnityParser】

package Easis.HTTP.Security;

import java.util.ArrayList;

/**
 * Collects the distinct decimal numeric character references (for example
 * {@code &#60;}) that occur in a string.
 *
 * <p>A reference is recognised only when it has the exact shape
 * {@code "&#" + one or more ASCII digits + ";"}. Hexadecimal references
 * ({@code &#x3C;}) and named entities ({@code &lt;}) are not recognised.
 *
 * <p>Instances are mutable and not synchronised.
 */
public class HtmlEnityParser {
    /**
     * Distinct references found so far, in order of first appearance.
     * Kept public (and typed as {@code ArrayList}) for backward compatibility.
     */
    public ArrayList<String> htmlEnityList=new ArrayList<String>();

    /**
     * Creates a parser and immediately scans {@code needParser}.
     *
     * @param needParser text to scan; {@code null} or empty yields an empty list
     */
    public HtmlEnityParser(String needParser){
        // Call the private scanner rather than the overridable parser() so a
        // subclass override can never run against a half-constructed object.
        scan(needParser);
    }

    /**
     * @return the live list of distinct references collected so far
     */
    public ArrayList<String> getEnityList(){
        return htmlEnityList;
    }

    /** Returns true only for the ASCII digits '0' to '9'. */
    private boolean isNumberChar(char c){
        return c>='0'&&c<='9';
    }

    /**
     * Scans {@code needCheckStr} and appends every not-yet-seen numeric
     * character reference to {@link #htmlEnityList}.
     *
     * @param needCheckStr text to scan; {@code null} or empty is a no-op
     */
    public void parser(String needCheckStr){
        scan(needCheckStr);
    }

    /**
     * Iterative scanner. The previous implementation recursed once per
     * "&#" occurrence and copied the remaining text each time, which could
     * overflow the stack and took quadratic time on large inputs.
     */
    private void scan(String text){
        if(text==null||text.length()==0){
            return;
        }
        final int len=text.length();
        int searchFrom=0;
        while(true){
            int begin=text.indexOf("&#",searchFrom);
            if(begin==-1){
                return;
            }
            int cursor=begin+2;
            if(cursor>=len){
                // "&#" is the very end of the text: nothing can follow it.
                return;
            }
            if(!isNumberChar(text.charAt(cursor))){
                // Not a numeric reference; resume right after this "&#".
                searchFrom=cursor;
                continue;
            }
            while(cursor<len&&isNumberChar(text.charAt(cursor))){
                cursor++;
            }
            if(cursor<len&&text.charAt(cursor)==';'){
                String enityStr=text.substring(begin,cursor+1);
                if(!htmlEnityList.contains(enityStr)){
                    htmlEnityList.add(enityStr);
                }
                searchFrom=cursor+1;
            }
            else{
                // Digits not terminated by ';' (or text ended): not a
                // reference. Resume at the first character after the digits.
                searchFrom=cursor;
            }
        }
    }

}

【SafeHtml正文】

package Easis.HTTP.Security;

import java.util.ArrayList;

/**
 * Whitelist-based HTML entity encoder/decoder for displaying untrusted text
 * as HTML text content.
 *
 * <p>{@link #HtmlEncode(String)} leaves whitelisted characters untouched and
 * turns everything else into a decimal numeric character reference
 * ({@code &#NNN;}). {@link #HtmlDecode(String)} reverses that encoding.
 *
 * <p>This class does not validate URLs (for example {@code javascript:} in
 * {@code href}/{@code src}); it only escapes text.
 */
public class SafeHtml {
    /**
     * Characters that are always encoded, checked before the whitelist.
     */
    public  static final char[] dangerousChars= new char[]{'"','<','>','\\','\''};

    /**
     * Characters emitted verbatim: ASCII letters, ASCII digits, a set of
     * punctuation, CR and LF.
     *
     * <p>'&' is deliberately NOT listed: if it were emitted raw, user text such
     * as "&#60;" would be interpreted by the browser as an entity and the
     * encode/decode round trip would not be lossless.
     */
    public static  final char[] allowChars=
            new char[]{'a','A','b','B','c','C','d','D',
           'e','E','f','F','g','G','h','H','i','I','j','J','k','K','l','L','m','M','n','N','o','O',
           'p','P','q','Q','r','R','s','S','t','T','u','U','v','V','w','W','x','X','y','Y',
           'z','Z','0','1','2','3','4','5','6','7','8','9',
           ',','.','?','/',':',';','[',']','{','}','|','=','+','-','_','(',')','*',
           '^','%','$','#','@','!','`','~','\n','\r'};

    /**
     * Tells whether {@code c} is listed in {@link #dangerousChars}.
     * @param c character to test
     */
    private  static  boolean isDangerousChar(char c){
        for (char ctmp:dangerousChars){
            if(ctmp==c){
                return true;
            }
        }
        return false;
    }

    /** Tells whether {@code c} is listed in {@link #allowChars}. */
    private static  boolean isAllowChar(char c){
        for (char c1:allowChars){
            if(c==c1){
                return true;
            }
        }
        return  false;
    }

    /** Chinese ideographs: U+4E00..U+9FA5 and U+3400..U+4DB5. */
    private static boolean isChinese(char c){
        return (c>='\u4E00'&&c<='\u9FA5')||(c>='\u3400'&&c<='\u4DB5');
    }

    /** Korean syllables: U+AC00..U+D7A3. */
    private static boolean isKorean(char c){
        return c>='\uAC00'&&c<='\uD7A3';
    }

    /** Japanese kana: U+3040..U+30FF and U+31F0..U+31FF. */
    private static boolean isJapanese(char c){
        return (c>='\u3040'&&c<='\u30FF')||(c>='\u31F0'&&c<='\u31FF');
    }

    /** True when {@code c} may be written to the output without encoding. */
    private static boolean isPassThrough(char c){
        if(isDangerousChar(c)){
            return false;
        }
        return isAllowChar(c)||isChinese(c)||isJapanese(c)||isKorean(c);
    }

    /** True only for the ASCII digits '0' to '9'. */
    private static boolean isAsciiDigit(char c){
        return c>='0'&&c<='9';
    }

    /**
     * Parses the ASCII digits in {@code s[from, to)} as a decimal code point.
     *
     * @return the code point, or -1 when the value exceeds U+10FFFF
     */
    private static int parseCodePoint(String s,int from,int to){
        int value=0;
        for(int i=from;i<to;i++){
            value=value*10+(s.charAt(i)-'0');
            if(value>Character.MAX_CODE_POINT){
                // Bail out early; this also keeps the accumulator from overflowing.
                return -1;
            }
        }
        return value;
    }

    /**
     * Encodes every non-whitelisted character of {@code htmlStr} as a decimal
     * numeric character reference.
     *
     * <p>The input is walked by Unicode code point, so a supplementary
     * character (a surrogate pair) becomes one reference such as
     * {@code &#128512;} instead of two surrogate references.
     *
     * @param htmlStr untrusted text; may be {@code null}
     * @return the encoded text, or {@code ""} for {@code null}/empty input
     */
    public static String HtmlEncode(String htmlStr){
        if(htmlStr==null||htmlStr.length()<=0){
            return  "";
        }
        final int len=htmlStr.length();
        StringBuilder sb_res=new StringBuilder(len+16);
        int i=0;
        while(i<len){
            int codePoint=htmlStr.codePointAt(i);
            i+=Character.charCount(codePoint);
            if(codePoint<=0xFFFF&&isPassThrough((char)codePoint)){
                sb_res.append((char)codePoint);
            }
            else{
                sb_res.append("&#").append(codePoint).append(';');
            }
        }
        return sb_res.toString();
    }

    /**
     * Decodes decimal numeric character references ({@code &#NNN;}) back into
     * characters.
     *
     * <p>Decoding is a single left-to-right pass, so text produced by decoding
     * one reference is never decoded a second time. References whose value is
     * above U+10FFFF are left in the output unchanged.
     *
     * @param encodedStr text produced by {@link #HtmlEncode(String)}; may be {@code null}
     * @return the decoded text, or {@code ""} for {@code null}/empty input
     */
    public static String HtmlDecode(String encodedStr){
        if(encodedStr==null||encodedStr.length()<=0){
            return "";
        }
        int amp=encodedStr.indexOf("&#");
        if(amp==-1){
            return  encodedStr;
        }
        final int len=encodedStr.length();
        StringBuilder sb_res=new StringBuilder(len);
        int copiedUpTo=0; // everything before this index is already in sb_res
        while(amp!=-1){
            int digitsStart=amp+2;
            int cursor=digitsStart;
            while(cursor<len&&isAsciiDigit(encodedStr.charAt(cursor))){
                cursor++;
            }
            int searchFrom=cursor;
            boolean wellFormed=cursor>digitsStart&&cursor<len&&encodedStr.charAt(cursor)==';';
            if(wellFormed){
                int codePoint=parseCodePoint(encodedStr,digitsStart,cursor);
                if(codePoint>=0){
                    sb_res.append(encodedStr,copiedUpTo,amp);
                    sb_res.appendCodePoint(codePoint);
                    copiedUpTo=cursor+1;
                }
                searchFrom=cursor+1;
            }
            amp=encodedStr.indexOf("&#",searchFrom);
        }
        sb_res.append(encodedStr,copiedUpTo,len);
        return sb_res.toString();
    }

}

【如何使用】

package TestCase;
import Easis.HTTP.Security.HtmlEnityParser;
import Easis.HTTP.Security.SafeHtml;

import java.util.ArrayList;

/**
 * Demo driver: encodes a sample string with {@code SafeHtml.HtmlEncode},
 * prints the result, decodes it with {@code SafeHtml.HtmlDecode} and prints
 * that as well.
 */
public class testSafeHtml {
    public static void main(String[] args){
        final String sample="↑↓→ ←↘↘↘↘〇●★★■□「\"':;?mjhfs235g」『』◇dfdf<>ggggggggsdf◇◇◇◇▲Ψsdf78954/*-+{}[]\\◇◇◇<=_09*&^%$REDGJjhfgf>№㊣∑";

        // Encode, show, then decode what was shown and show that too.
        final String encoded=echo(SafeHtml.HtmlEncode(sample));
        echo(SafeHtml.HtmlDecode(encoded));
    }

    /** Prints {@code text} on its own line and hands it back unchanged. */
    private static String echo(String text){
        System.out.println(text);
        return text;
    }
}

【为什么不用正则表达式】

因为本人觉得正则效率低。

【应用场合】

这个只能应用在用户留言等地方（因为里面没有针对javascript:这种src及href的注入方式进行考虑）。至于对url，图片地址等字符的过滤，下一篇文章将介绍。