1、校验文件后缀名
2、校验文件的大小
import java.util.Arrays;
import java.util.List;
/**
 * Pluggable rule set consulted when validating an uploaded file.
 *
 * <p>Every method has a permissive default, so an implementation only overrides the checks it
 * wants to tighten.
 */
public interface FileVerifyRule {

    /**
     * Checks whether a file extension is allowed.
     *
     * <p>When {@link #getExtensions()} returns {@code null} or an empty list, no whitelist is
     * configured and every extension (including {@code null}) is accepted. Otherwise the
     * extension must equal one of the listed entries, ignoring case.
     *
     * @param extension file extension to check; may be {@code null}, in which case it is
     *     rejected whenever a whitelist is configured
     * @return {@code true} if allowed, {@code false} otherwise
     */
    default boolean checkExtension(String extension) {
        List<String> extensions = getExtensions();
        if (extensions == null || extensions.isEmpty()) {
            return true;
        }
        if (extension == null) {
            // A bound method reference on a null receiver throws NullPointerException;
            // a missing extension can never match a whitelist, so reject it explicitly.
            return false;
        }
        return extensions.stream().anyMatch(extension::equalsIgnoreCase);
    }

    /**
     * Returns the whitelist of allowed extensions.
     *
     * @return the allowed extensions, or {@code null} (the default) / an empty list to allow all
     */
    default List<String> getExtensions() {
        return null;
    }

    /**
     * Checks whether a file size is allowed. The default accepts every size.
     *
     * @param size file size in bytes
     * @return {@code true} if allowed, {@code false} otherwise
     */
    default boolean checkSize(long size) {
        return true;
    }
}
3、校验文件内容
4、校验 PDF 文件是否包含脚本
import cn.hutool.core.io.FileUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.owasp.validator.html.AntiSamy;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import org.xml.sax.ContentHandler;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;
import static org.apache.tika.metadata.TikaCoreProperties.RESOURCE_NAME_KEY;
/**
* 文件上传时,验证文件是否合法。
*/
@Slf4j
@Component
public class FileVerifyService {
private final Tika tika = new Tika();
private final FileVerifyRule rule = new FileVerifyRule() {
};
/**
* 验证文件是否合法。满足整个项目的合法性。
*
* @param file 文件
* @return true:合法;false:不合法
*/
public boolean verify(MultipartFile file) {
return verify(file, rule);
}
/**
* 验证文件是否合法。传递校验规则以适用不同的业务场景,否则就需要满足整个项目的合法性。
*
* @param file 文件
* @param rule 校验规则
* @return true:合法;false:不合法
*/
public boolean verify(MultipartFile file, FileVerifyRule rule) {
if (file == null || file.isEmpty()) return false;
if (rule == null) rule = this.rule;
// 校验文件大小
if (!rule.checkSize(file.getSize())) return false;
// 获取文件的后缀名,并校验文件后缀
if(!rule.checkExtension(FileUtil.getSuffix(file.getOriginalFilename()))) return false;
// 校验文件类型
return checkExtension(file, rule);
}
/**
* 校验文件类型
* @param file 文件
* @param rule 规则
* @return true:合法;false:不合法
*/
private boolean checkExtension(MultipartFile file, FileVerifyRule rule) {
try {
String contentType = tika.detect(file.getInputStream());
// log.info("ContentType: {}", contentType); // image/svg+xml
// 获取tika提供的默认参照表
// 可以进行自定义,参照https://stackoverflow.com/questions/13650372/how-to-determine-appropriate-file-extension-from-mime-type-in-java
MimeTypes allTypes = MimeTypes.getDefaultMimeTypes();
// 根据MimeType名称获取MimeType类型
MimeType mimeType = allTypes.forName(contentType);
// log.info("MimeType: {}", contentType); // image/svg+xml
// 根据MimeType类型获取对应的后缀名
String extension = getExtension(mimeType);
String content = getContent(file);
log.info("mime extension: {} {}", mimeType, extension); // .svg
if(!rule.checkExtension(extension)) return false;
if(!checkContent(content)) return false;
//检查pdf是否含有脚本
if("pdf".equals(extension)){
return checkPDFScript(file);
}
return true;
} catch (Exception e) {
log.error("file verify error: {}", e.getMessage());
return false;
}
}
private String getExtension(MimeType mimeType) {
String extra= mimeType.getExtension();
return extra.startsWith(".") ? extra.substring(1) : extra;
}
public boolean checkContent(String inputHtml) {
// 加载Antisamy策略文件
try {
URL url = getClass().getResource("/antisamy-tinymce.xml");
Policy policy = Policy.getInstance(url);
AntiSamy as = new AntiSamy();
// 清理HTML内容
CleanResults cr = as.scan(inputHtml, policy);
// 如果存在错误,则返回清理后的内容
if (cr.getErrorMessages().size() > 0) {
log.error("file verify error: {}", cr.getErrorMessages());
return false;
}
// 返回经过策略验证的HTML内容
//return cr.getCleanHTML();
} catch (Exception e) {
log.error("file verify error: {}", e.getMessage());
}
return true;
}
public String getContent(MultipartFile file) throws Exception {
InputStream inputStream = file.getInputStream();
try {
Parser parser = new AutoDetectParser();
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
metadata.set(RESOURCE_NAME_KEY, file.getOriginalFilename());
parser.parse(inputStream, handler, metadata, new ParseContext());
String text = handler.toString();
return text;
}finally {
inputStream.close();
}
}
//检查pdf文件是否有脚本
private boolean checkPDFScript(MultipartFile file) throws IOException {
try (PDDocument document = PDDocument.load(file.getInputStream())) {
if (!document.isEncrypted()) {
PDDestinationOrAction javascript = document.getDocumentCatalog().getOpenAction();
if (javascript != null) {
if(javascript instanceof PDActionJavaScript) {
return false;
}
String js = javascript.toString();
log.info("JavaScript: " + js);
} else {
log.info("No JavaScript found.");
}
PDFTextStripperByArea stripper = new PDFTextStripperByArea() {
@Override
protected void writeString(String str, List<TextPosition> textPositions) {
for (TextPosition position : textPositions) {
if (position.getUnicode().contains("script")) {
log.info("脚本: {}", position.getUnicode());
} else {
System.out.print(position.getUnicode());
}
}
}
};
stripper.addRegion("script", new Rectangle2D.Double(0, 0, 10000, 10000));
log.info("页数: {}", document.getPages().getCount());
stripper.extractRegions(document.getPage(0));
} else {
log.info("已加密");
}
}
return true;
}
}
<!-- OWASP AntiSamy: supplies the org.owasp.validator.html classes (AntiSamy, Policy, CleanResults) imported by FileVerifyService. -->
<dependency>
<groupId>org.owasp.antisamy</groupId>
<artifactId>antisamy</artifactId>
<version>1.7.6</version>
</dependency>