过滤 外文unicode文本中字符的代码

Author: godspirit

 

This source code filters unexpected characters out of a Unicode TXT document and generates a formatted TXT file. For example, Chinese characters are removed from the original file.

 

If you want to look up the Unicode code point of a certain character, I advise you to run the "charmap" command, which is available from the Windows command line and under X Windows on Linux.

 

 

 

 

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Code unit value of the ASCII space character. */
#define SPACE (0x20)

/* Compile-time language selection: exactly one of the following macros is
   expected to be defined; UnicodeFilter tests them with #if defined(...)
   to decide which accented letters are accepted. */
#define PORTUGUESE (1)
//#define ITALIAN  (1)
//#define SPANISH  (1)
//#define FRANCH  (1)

/* History of the most recently accepted code units, maintained by
   UpdateLastShort; LastShort[0] is the newest entry. */
static unsigned short LastShort[3] = {0};
/* Forward declaration: IsUnicodeOK calls IsComma before its definition. */
unsigned short IsComma(unsigned short data);
/* NOTE(review): not referenced anywhere in the visible code. */
static unsigned short ThrowFlag = 0;

/*
 * Record `current` as the most recently accepted code unit.
 *
 * The existing history in LastShort is shifted one slot towards the end
 * (the oldest entry is dropped) and `current` is stored in LastShort[0].
 */
void UpdateLastShort(unsigned short current)
{
    const int depth = (int)(sizeof(LastShort) / sizeof(LastShort[0]));
    int i;

    /* Start at the last valid index (depth - 1). Starting at `depth`
       would store one element past the end of the array. */
    for (i = depth - 1; i > 0; i--)
    {
        LastShort[i] = LastShort[i - 1];
    }
    LastShort[0] = current;
}

/*
 * Decide whether `current` may follow the code units accepted so far.
 *
 * A code unit is rejected (return 0, history untouched) when it would
 * produce a repetition:
 *   - two consecutive characters for which IsComma() is true,
 *   - a CR or LF when the entry two positions back (LastShort[1]) is
 *     already CR or LF,
 *   - a doubled ':', '"', '\'' or '-',
 *   - a space directly after a space, CR or LF.
 * Otherwise the code unit is pushed onto the history via
 * UpdateLastShort() and 1 is returned.
 */
unsigned short IsUnicodeOK(unsigned short current)
{
    const unsigned short newest = LastShort[0];
    const unsigned short older = LastShort[1];
    unsigned short rejected = 0;

    if (IsComma(current) && IsComma(newest))
    {
        rejected = 1;
    }
    else
    {
        switch (current)
        {
        case 0x0d:
        case 0x0a:
            /* line-break characters are compared against the entry two back */
            rejected = (unsigned short)((older == 0x0d) || (older == 0x0a));
            break;
        case 0x3a: /* : */
        case 0x22: /* " */
        case 0x27: /* ' */
        case 0x2d: /* - */
            rejected = (unsigned short)(newest == current);
            break;
        case SPACE:
            rejected = (unsigned short)((newest == SPACE)
                                        || (newest == 0x0d)
                                        || (newest == 0x0a));
            break;
        default:
            break;
        }
    }

    if (rejected)
    {
        return 0;
    }

    UpdateLastShort(current);
    return 1;
}


/*
 * Return 1 when `data` is a clause-level separator, 0 otherwise.
 *
 * Separators are '?', '!', ';', ',' and '.', plus a CR whose predecessor
 * in the history (LastShort[0]) is LF, or an LF whose predecessor is CR.
 */
unsigned short IsComma(unsigned short data)
{
    switch (data)
    {
    case 0x003f: /* ? */
    case 0x0021: /* ! */
    case 0x003b: /* ; */
    case 0x002c: /* , */
    case 0x002e: /* . */
        return 1;
    case 0x000d:
        return (LastShort[0] == 0x0a) ? 1 : 0;
    case 0x000a:
        return (LastShort[0] == 0x0d) ? 1 : 0;
    default:
        return 0;
    }
}

/*
 * Return 1 when `data` ends a sentence, 0 otherwise.
 *
 * Sentence terminators are '?', '!' and '.', plus a CR whose predecessor
 * in the history (LastShort[0]) is LF, or an LF whose predecessor is CR.
 */
unsigned short IsStop(unsigned short data)
{
    switch (data)
    {
    case 0x003f: /* ? */
    case 0x0021: /* ! */
    case 0x002e: /* . */
        return 1;
    case 0x000d:
        return (LastShort[0] == 0x0a) ? 1 : 0;
    case 0x000a:
        return (LastShort[0] == 0x0d) ? 1 : 0;
    default:
        return 0;
    }
}

/*
 * Return 1 when `data` separates words, 0 otherwise.
 *
 * Word separators are: space, CR, LF, the digits 0-9, and the punctuation
 * characters  ! " ' ( ) , - . : ; ?
 */
unsigned short IsDivision(unsigned short data)
{
    /* digits 0-9 */
    if (data >= 0x0030 && data <= 0x0039)
    {
        return 1;
    }

    switch (data)
    {
    case 0x000a: /* \n */
    case 0x000d: /* \r */
    case 0x0020: /* space */
    case 0x0021: /* ! */
    case 0x0022: /* " */
    case 0x0027: /* ' */
    case 0x0028: /* ( */
    case 0x0029: /* ) */
    case 0x002c: /* , */
    case 0x002d: /* - */
    case 0x002e: /* . */
    case 0x003a: /* : */
    case 0x003b: /* ; */
    case 0x003f: /* ? */
        return 1;
    default:
        return 0;
    }
}

/*
If we find an unexpected character, we should discard the whole word.
*/

/*
 * Buffer one code unit of the current word and write the buffered word
 * to `fp` once a word separator is seen.
 *
 * current   - code unit to process.
 * fp        - stream that receives the buffered word when it is flushed.
 * throwFlag - 0: `current` is checked with IsUnicodeOK() and, if accepted,
 *                appended to the pending word;
 *             non-zero: `current` is NOT appended; it is only tested as a
 *                separator, so passing a separator flushes whatever is
 *                pending to `fp`.
 *
 * Returns 1 when `current` was rejected by IsUnicodeOK(), 0 otherwise.
 *
 * The pending word lives in a function-static buffer, so it is shared
 * between all callers and all streams.
 */
unsigned short Put2File(unsigned short current, FILE *fp, unsigned short throwFlag)
{
    enum { TEMPBUF_CAPACITY = 128 };
    static unsigned short tempbuf[TEMPBUF_CAPACITY] = {0};
    static unsigned short count = 0;

    if (throwFlag == 0)
    {
        if (IsUnicodeOK(current) == 0)
        {
            return 1;
        }
        /* A run of TEMPBUF_CAPACITY units without any separator would
           otherwise write past the end of tempbuf; flush the part
           collected so far and keep going. */
        if (count >= TEMPBUF_CAPACITY)
        {
            fwrite(tempbuf, sizeof(unsigned short), count, fp);
            count = 0;
        }
        tempbuf[count] = current;
        count++;
    }

    if (IsDivision(current) && (count != 0))
    {
        fwrite(tempbuf, sizeof(unsigned short), count, fp);
        count = 0;
    }
    return 0;
}


/* Return non-zero when `c` is a basic Latin letter (A-Z, a-z) or one of the
   accented letters accepted for the language selected at compile time. */
static int IsAllowedLetter(unsigned short c)
{
    return (c >= 0x0041 && c <= 0x005A) /* A-->Z */
        || (c >= 0x0061 && c <= 0x007A) /* a-->z */
#if defined(PORTUGUESE)
        || (c >= 0x00C0 && c <= 0x00C3)
        || (c >= 0x00E0 && c <= 0x00E3)
        || (c == 0x00C7) || (c == 0x00E7)
        || (c == 0x00C9) || (c == 0x00E9)
        || (c == 0x00CA) || (c == 0x00EA)
        || (c == 0x00CD) || (c == 0x00ED)
        || (c >= 0x00D3 && c <= 0x00D5)
        || (c >= 0x00F3 && c <= 0x00F5)
        || (c == 0x00DA) || (c == 0x00FA)
        || (c == 0x00DC) || (c == 0x00FC)
#elif defined(ITALIAN)
        || (c == 0x00C8) || (c == 0x00E8)
        || (c == 0x00C9) || (c == 0x00E9)
        || (c == 0x00D2) || (c == 0x00F2)
        || (c == 0x00D3) || (c == 0x00F3)
#elif defined(SPANISH)
        || (c == 0x00C1) || (c == 0x00E1)
        || (c == 0x00C9) || (c == 0x00E9)
        || (c == 0x00CD) || (c == 0x00ED)
        || (c == 0x00D1) || (c == 0x00F1)
        || (c == 0x00D3) || (c == 0x00F3)
        || (c == 0x00DA) || (c == 0x00FA)
        || (c == 0x00DC) || (c == 0x00FC)
#elif defined(FRANCH)
        || (c == 0x00C0) || (c == 0x00E0)
        || (c == 0x00C2) || (c == 0x00E2)
        || (c >= 0x00C7 && c <= 0x00CB)
        || (c >= 0x00E7 && c <= 0x00EB)
        || (c == 0x00CE) || (c == 0x00EE)
        || (c == 0x00CF) || (c == 0x00EF)
        || (c == 0x00D4) || (c == 0x00F4)
        || (c == 0x00DB) || (c == 0x00FB)
        || (c == 0x00D9) || (c == 0x00F9)
        || (c == 0x00DC) || (c == 0x00FC)
        || (c == 0x0178) || (c == 0x00FF)
#endif
        ;
}

/* Map a fullwidth or typographic punctuation mark to its ASCII equivalent.
   Returns 0 when `c` has no replacement. */
static unsigned short ToAsciiPunctuation(unsigned short c)
{
    switch (c)
    {
    case 0xff1f: return 0x003f; /* ? */
    case 0xff0c: return 0x002c; /* , (U+FF0C; U+FF1C is the fullwidth '<') */
    case 0xff0e: return 0x002e; /* . */
    case 0xff01: return 0x0021; /* ! */
    case 0xff1a: return 0x003a; /* : */
    case 0xff1b: return 0x003b; /* ; */
    case 0xff0d:
    case 0x2013:
    case 0x2014:
    case 0x2015:
    case 0x2016: return 0x002d; /* - */
    case 0xff07:
    case 0x2018:
    case 0x2019: return 0x0027; /* ' */
    case 0xff02:
    case 0x201c:
    case 0x201d: return 0x0022; /* " */
    default:     return 0;
    }
}

/*
 * Filter the 16-bit Unicode text file `FileName`.
 *
 * The file is read one `unsigned short` at a time and must start with the
 * byte-order mark 0xfeff. Accepted characters (allowed letters and word
 * separators) are appended to "new_<FileName>" through Put2File();
 * fullwidth/typographic punctuation is first replaced by its ASCII
 * equivalent; everything else is appended to "throw_<FileName>".
 *
 * Returns 0 on success, 1 when a file cannot be opened or closed, the
 * derived file names do not fit, or the byte-order mark is missing.
 */
unsigned short UnicodeFilter(char * FileName)
{
    FILE *fp = NULL;
    FILE *fpNew = NULL;
    FILE *fpThrow = NULL;
    unsigned long ThrowCnt = 0;
    unsigned short UnicodeHeader = 0;
    unsigned short CurRusCharUni = 0; /* code unit most recently read from the input */
    unsigned short result = 1;
    char newFileName[128] = {0};
    char ThrowFileName[128] = {0};
    int len;

    if (FileName == NULL)
    {
        printf("File Open Failed~!\n");
        return 1;
    }

    /* "throw_" is the longer prefix, so if it fits "new_" fits as well. */
    len = snprintf(ThrowFileName, sizeof(ThrowFileName), "throw_%s", FileName);
    if (len < 0 || (size_t)len >= sizeof(ThrowFileName))
    {
        printf("File name too long~!\n");
        return 1;
    }
    snprintf(newFileName, sizeof(newFileName), "new_%s", FileName);

    fp = fopen(FileName, "rb");
    if (fp == NULL)
    {
        printf("File Open Failed~!\n");
        return 1;
    }

    /* The input must start with the byte-order mark. */
    if (fread(&UnicodeHeader, sizeof(unsigned short), 1, fp) != 1
        || UnicodeHeader != 0xfeff)
    {
        printf("Not a Unicode TXT file (byte-order mark missing)~!\n");
        goto cleanup;
    }

    fpNew = fopen(newFileName, "ab");
    fpThrow = fopen(ThrowFileName, "ab");
    if ((fpNew == NULL) || (fpThrow == NULL))
    {
        printf("newFileName Open Failed~!\n");
        goto cleanup;
    }

    while (fread(&CurRusCharUni, sizeof(unsigned short), 1, fp) == 1)
    {
        unsigned short replacement;

        if (IsAllowedLetter(CurRusCharUni) || IsDivision(CurRusCharUni))
        {
            /* valid character: goes to the new file */
            Put2File(CurRusCharUni, fpNew, 0);
            continue;
        }

        printf("A unexpected char -[0x%x]- detected~\n", (unsigned)CurRusCharUni);

        /* Replace known punctuation variants by their ASCII form. */
        replacement = ToAsciiPunctuation(CurRusCharUni);
        if (replacement != 0)
        {
            Put2File(replacement, fpNew, 0);
            continue;
        }

        /* No replacement: the pending partial word and the offending
           character both go to the throw file. */
        Put2File(0x20, fpThrow, 1);
        fwrite(&CurRusCharUni, sizeof(unsigned short), 1, fpThrow);

        ThrowCnt++;
        if (ThrowCnt % 50 == 49)
        {
            printf("%lu characters have been thrown out!\r\n", ThrowCnt);
        }
    }

    /* Put2File only writes on a separator; flush a final word that is not
       followed by one, otherwise it would be lost. With throwFlag != 0 the
       space itself is not written. */
    Put2File(0x20, fpNew, 1);

    result = 0;

cleanup:
    if (fpThrow != NULL && fclose(fpThrow) != 0)
    {
        result = 1;
    }
    if (fpNew != NULL && fclose(fpNew) != 0)
    {
        result = 1;
    }
    fclose(fp);

    return result;
}

#define UPDATE_PARTTIION_FILE /
do{  /
 FileCnt ++; /
 WordCnt = 0; /
 sprintf(partitionFileName, "%d_%s",FileCnt, newFileName); /
 fclose(fpPartition); /
 fpPartition = fopen(partitionFileName,"ab"); /
 fwrite(newline, sizeof(unsigned short), 4, fpFormat); /
 fwrite(newline, sizeof(unsigned short), 4, fpPartition); /
}while(0)

 

unsigned short FilePartition(char * FileName)
{
 FILE *fpNew = NULL;
 FILE *fpPartition = NULL;
 FILE *fpFormat = NULL;
 unsigned long ThrowCnt = 0;
 unsigned short UnicodeHeader = 0;
 unsigned short CurRusCharUni = 0;//个岸s鲰卓的nicode?
 int FileCnt = 1, WordCnt = 0;
 unsigned short newline[4] = {0x0d,0x0a,0x0d,0x0a};
 char newFileName[128] = {0};
 char partitionFileName[128] = {0};
 char FormatFileName[128] = {0};
 
 sprintf(newFileName,"new_%s",FileName);
 sprintf(FormatFileName,"format_%s",FileName);
 sprintf(partitionFileName, "%d_%s",FileCnt,newFileName);

     
 fpPartition = fopen(partitionFileName,"ab");
 fpFormat = fopen(FormatFileName,"ab");
 fpNew = fopen(newFileName,"rb");
    if ((fpNew == NULL)||(fpPartition == NULL)||(fpFormat == NULL))
    {
            printf("FilePartition File Open Failed~!/n");
            return 1;
    }

 
    //fread(&UnicodeHeader,sizeof(unsigned short),1,fp);
    //printf("Get a char unicode= %x/n",UnicodeHeader); 
 
 while((fread(&CurRusCharUni,sizeof(unsigned short),1,fpNew ))&&(fpPartition != NULL))
 {
  fwrite(&CurRusCharUni, sizeof(unsigned short), 1, fpPartition);
  fwrite(&CurRusCharUni, sizeof(unsigned short), 1, fpFormat);
  
  if(WordCnt < 40)
  {
   if(CurRusCharUni == SPACE)
    WordCnt++;
  }
  else if((WordCnt >= 40)&&(WordCnt < 50))
  {
   if(CurRusCharUni == SPACE)
    WordCnt++;
   else if(IsStop(CurRusCharUni))
   {
    FileCnt ++;
    UPDATE_PARTTIION_FILE;
   }
  }
  else if((WordCnt >= 50)&&(WordCnt < 65))
  {
   if(CurRusCharUni == SPACE)
    WordCnt++;
   else if(IsComma(CurRusCharUni))
   {
    UPDATE_PARTTIION_FILE;
   }
  }
  else
  {
   if(CurRusCharUni == SPACE)
   {
    UPDATE_PARTTIION_FILE;
   }
  }
 }
 fclose(fpNew); 
        //if satisfy the header requirement of the unicode file
 return 0;
}

/*
 * Entry point: filter and partition the Unicode TXT file named by argv[1].
 *
 * Steps:
 *   1. remove the outputs of a previous run,
 *   2. create the directory "<file>_Output",
 *   3. run UnicodeFilter() and FilePartition() on the file,
 *   4. move the generated "*_new_*" partition files into the directory.
 *
 * The shell commands rely on the Unix tools rm, mkdir and mv. The file name
 * is passed to the shell inside single quotes, so names containing a single
 * quote are rejected.
 *
 * Returns 0 on success and 1 on any failure (missing argument, unusable
 * file name, filter or partition error).
 */
int main(int argc, char * argv[])
{
    unsigned short RltOfFilter = 0;
    /* Room for three copies of the file name plus the command text; every
       snprintf below is checked for truncation. */
    char buf[512] = {0};
    int len;

    if (argc < 2)
    {
        printf("Please input the name of TXT file:\n");
        return 1;
    }

    /* The name is embedded in single-quoted shell words below; a quote in
       the name would end the quoting and let the rest run as shell code. */
    if (strchr(argv[1], '\'') != NULL)
    {
        printf("[ERROR]  File name must not contain a single quote\n");
        return 1;
    }

    /* NOTE(review): this removes "Output", while the directory created
       below is "<file>_Output" - confirm which one is intended. */
    len = snprintf(buf, sizeof(buf), "rm -rf Output 'new_%s' 'throw_%s' 'format_%s'",
                   argv[1], argv[1], argv[1]);
    if (len < 0 || (size_t)len >= sizeof(buf))
    {
        printf("[ERROR]  File name too long\n");
        return 1;
    }
    system(buf);

    len = snprintf(buf, sizeof(buf), "mkdir './%s_Output'", argv[1]);
    if (len < 0 || (size_t)len >= sizeof(buf))
    {
        printf("[ERROR]  File name too long\n");
        return 1;
    }
    /* The directory may already exist from a previous run; not fatal. */
    system(buf);

    RltOfFilter = UnicodeFilter( argv[1] );
    if (RltOfFilter != 0)
    {
        printf("[ERROR]  Format TXT File failed\n");
        return 1;
    }

    RltOfFilter = FilePartition( argv[1] );
    if (RltOfFilter != 0)
    {
        printf("[ERROR]  Partition TXT File failed\n");
        return 1;
    }

    /* The glob must stay unquoted so the shell expands it. */
    len = snprintf(buf, sizeof(buf), "mv *_new_* './%s_Output'", argv[1]);
    if (len < 0 || (size_t)len >= sizeof(buf))
    {
        printf("[ERROR]  File name too long\n");
        return 1;
    }
    system(buf);
    return 0;
}

 

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值