首页 > 开发 > 综合 > 正文

自动辨别文本是不是UTF-8的C#程序

2024-07-21 02:27:15
字体:
来源:转载
供稿:网友
,欢迎访问网页设计爱好者web开发。

/// <summary>
/// Recursively walks a directory tree. For every file whose lower-cased extension is
/// accepted by <c>checkfiletype</c> and that <c>isutf8</c> reports as not UTF-8, the
/// file is decoded twice: once as UTF-8 and once with the system default encoding.
/// </summary>
/// <param name="path">Directory to scan. Its sub-directories are processed before its own files.</param>
private void findnoutffile(string path)
   {
       DirectoryInfo folder = new DirectoryInfo(path);

       // Descend into every sub-directory first, then handle the files of this one.
       foreach (DirectoryInfo subfolder in folder.GetDirectories())
       {
           findnoutffile(subfolder.FullName);
       }

       foreach (FileInfo file in folder.GetFiles())
       {
           if (!checkfiletype(file.Extension.ToLower()))
           {
               continue;
           }

           // 'using' guarantees the handle is released even if isutf8 throws,
           // which the manual Close() call did not.
           bool butf8;
           using (FileStream fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
           {
               butf8 = isutf8(fs);
           }

           if (butf8)
           {
               continue;
           }

           StringBuilder sb = new StringBuilder();   // text decoded with the default encoding
           StringBuilder sb2 = new StringBuilder();  // text decoded as UTF-8

           using (System.IO.StreamReader reader =
                      new System.IO.StreamReader(file.FullName, System.Text.Encoding.UTF8))
           {
               sb2.Append(reader.ReadToEnd());
           }

           using (System.IO.StreamReader reader =
                      new System.IO.StreamReader(file.FullName, System.Text.Encoding.Default, true))
           {
               sb.Append(reader.ReadToEnd());
           }

           // NOTE(review): neither sb nor sb2 is consumed in the code visible here;
           // the step that reports or converts the file appears to be missing - confirm
           // against the complete original before relying on this method.
       }
   }
 
   //0000 0000-0000 007F - 0xxxxxxx  (ASCII converts to 1 octet!)
   //0000 0080-0000 07FF - 110xxxxx 10xxxxxx    (2 octet format)
   //0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format)
   //0001 0000-0010 FFFF - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (4 octet format)

   private static bool isutf8(filestream sbinputstream)
   {
    int   i;
    byte coctets;  // octets to go in this utf-8 encoded character
    byte chr;
    bool  ballascii= true;
    long ilen = sbinputstream.length;

    coctets= 0;
    for( i=0; i < ilen; i++ )
    {
     chr = (byte)sbinputstream.readbyte();

     if( (chr & 0x80) != 0 ) ballascii= false;

     if( coctets == 0 ) 
     {
      if( chr >= 0x80 )
      { 
       do
       {
        chr <<= 1;
        coctets++;
       }
       while( (chr & 0x80) != 0 );

       coctets--;                       
       if( coctets == 0 ) return false; 
      }
     }
     else
     {
      if( (chr & 0xc0) != 0x80 )
      {
       return false;
      }
      coctets--;                      
     }
    }

    if( coctets > 0 )
    { 
     return false;
    }

    if( ballascii )
    {   
     return false;
    }

    return true;

   }
  }
  
 
 }

 

发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表