package org.apache.nutch.parse.html;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts a publish timestamp from page text.
 *
 * <p>The text is scanned for timestamps shaped like
 * {@code 2012-06-15 12:30:00}, {@code 2012/6/15 12:30} or the Chinese form
 * using the year/month/day/hour/minute markers. For forum (BBS) URLs the
 * latest timestamp that is not in the future is used; for every other URL
 * the first timestamp found is used.
 *
 * <p>All methods are static and keep no shared mutable state, so they may be
 * called from several threads at once.
 *
 * @author xum
 */
public class PublishTimeExtract {

  /**
   * Timestamp shape recognised in page text. An optional leading ':', '>' or
   * whitespace character is tolerated, and any single character may separate
   * the date part from the time part.
   *
   * <p>Capture groups: 1 = year (20xx), 2 = month, 3 = day, 4 = hour,
   * 5 = minute, 6 = second (optional, {@code null} when absent).
   */
  private static final String TIME_REGEX = "(?::|>|\\s)?(20[0-9]{2})(?:-|/|\\.|\\u5e74)(\\d{1,2})(?:-|/|\\.|\\u6708)(\\d{1,2})(?:\\u65e5)?.(\\d{2})(?::|\\u65f6)(\\d{2})(?:(?::|\\u5206)(\\d{2}))?";
  private static final Pattern TIME_PATTERN = Pattern.compile(TIME_REGEX);

  /** Canonical layout every matched timestamp is rebuilt into before parsing. */
  private static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

  /**
   * URLs treated as forum pages: hosts starting with {@code bbs.} and Tianya
   * forum content pages. The first alternative used to read
   * {@code http://bbs/\\..*}; the stray '/' meant ordinary
   * {@code http://bbs.<domain>} URLs could never match.
   */
  private static final String BBS_URL = "(http://bbs\\..*|http://www\\.tianya\\.cn/[a-zA-Z]*forum/content/.*)";
  private static final Pattern BBS_URL_PATTERN = Pattern.compile(BBS_URL);

  /**
   * Fixed offset added to every returned value.
   * NOTE(review): presumably a UTC+8 adjustment expected by downstream
   * consumers; kept exactly as in the original implementation - confirm.
   */
  private static final long OUTPUT_SHIFT_MILLIS = 8L * 3600 * 1000;

  /**
   * Finds the publish time of a page.
   *
   * <p>Timestamps are interpreted in the JVM default time zone. When no
   * usable timestamp is found the current time is used instead.
   *
   * @param content page text to scan; {@code null} is treated as text
   *                containing no timestamp
   * @param url     page URL, used only to decide whether the page is a forum
   *                page; {@code null} is treated as a non-forum URL
   * @return the selected time in epoch milliseconds plus eight hours, as a
   *         decimal string; never {@code null}
   */
  public static String extractDate(String content, String url) {
    Date now = new Date();

    if (content == null) {
      return shifted(now);
    }

    Matcher matcher = TIME_PATTERN.matcher(content);
    // SimpleDateFormat is not thread-safe, so each call gets its own instance
    // instead of sharing a static one.
    SimpleDateFormat parser = new SimpleDateFormat(DATE_FORMAT);
    boolean forumPage = url != null && BBS_URL_PATTERN.matcher(url).matches();

    Date chosen = null;
    if (forumPage) {
      // Forum pages: take the most recent post time, ignoring future dates.
      while (matcher.find()) {
        Date candidate = toDate(matcher, parser);
        if (candidate == null || candidate.after(now)) {
          continue;
        }
        if (chosen == null || candidate.after(chosen)) {
          chosen = candidate;
        }
      }
    } else if (matcher.find()) {
      // News pages: the first timestamp on the page is the publish time.
      chosen = toDate(matcher, parser);
    }

    return shifted(chosen != null ? chosen : now);
  }

  /**
   * Rebuilds the current match as {@code yyyy-M-d HH:mm:ss} from its capture
   * groups and parses it. Missing seconds default to {@code 00}, so
   * timestamps without a seconds field are no longer rejected.
   *
   * @return the parsed date, or {@code null} if it cannot be parsed
   */
  private static Date toDate(Matcher match, SimpleDateFormat parser) {
    String seconds = match.group(6) != null ? match.group(6) : "00";
    String normalized = match.group(1) + "-" + match.group(2) + "-"
        + match.group(3) + " " + match.group(4) + ":" + match.group(5)
        + ":" + seconds;
    try {
      return parser.parse(normalized);
    } catch (ParseException ignored) {
      // Best effort: an unparseable candidate is skipped by the caller.
      return null;
    }
  }

  /** Renders a date as epoch milliseconds plus the fixed output shift. */
  private static String shifted(Date date) {
    return String.valueOf(date.getTime() + OUTPUT_SHIFT_MILLIS);
  }

}