基于Snoopy的PHP近似完美获取网站编码的代码 _爱资料
主页 > 编程资料 > PHP >
发布时间:2015-12-29 作者:网络 阅读:206次
首先需要从网上下载 Snoopy.class.php,并放到 lib 目录下。
调用方法:
代码如下:
// Usage example (place inside a PHP script): load Snoopy and the WebCrawl
// class, then print the charset detected for a page.
require 'lib/Snoopy.class.php';
require 'lib/WebCrawl.class.php'; // contains the WebCrawl class listed below

$go = new WebCrawl('http://www.baidu.com');
$charset = $go->getCharset();

// getCharset() returns false when the page cannot be fetched or no charset
// can be determined; report that explicitly instead of echoing an empty string.
echo $charset === false ? 'unknown' : $charset;

lib/WebCrawl.class.php 的代码如下:
/**
 * Fetches a web page with Snoopy and reports its character encoding and
 * basic meta information (title, keywords, description, IPv4 address).
 *
 * The page is fetched at most once per instance. After a successful fetch the
 * body is converted to UTF-8 (when the detected charset is known to mbstring)
 * and ASCII-lowercased, so values returned by getWebinfo() are lowercase.
 *
 * Requires the Snoopy class to be loaded before open() is triggered.
 */
class WebCrawl
{
    /** @var string URL of the page to inspect. */
    private $url;

    /** @var Snoopy|null HTTP client holding the fetched response; null until the first fetch. */
    private $request;

    /** @var string|false|null Charset detected on the raw response; null until a successful fetch. */
    private $charset;

    /**
     * Charset names (lowercase) accepted when they appear in a <meta> tag.
     *
     * NOTE(review): 'ibm01040'..'ibm01049' look like typos for the IBM0114x
     * code-page names, and a few entries are duplicated; the list is kept
     * unchanged because it is public - confirm before editing.
     *
     * @var string[]
     */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720',
        'ibm737', 'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858',
        'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865',
        'cp866', 'ibm869', 'ibm870', 'windows-874', 'cp875',
        'shift_jis', 'ks_c_5601-1987', 'ibm1026',
        'ibm01047', 'ibm01047',
        'ibm01040', 'ibm01041', 'ibm01042', 'ibm01043', 'ibm01044',
        'ibm01045', 'ibm01046', 'ibm01047', 'ibm01048', 'ibm01049',
        'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254',
        'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258',
        'johab', 'macintosh',
        'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic',
        'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp',
        'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce',
        'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian',
        'x-chinese-cns', 'x-cp20001', 'x-chinese-eten',
        'x-cp20003', 'x-cp20004', 'x-cp20005',
        'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian',
        'us-ascii', 'x-cp20261', 'x-cp20269',
        'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285', 'ibm290',
        'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924',
        'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9',
        'iso-8859-13', 'iso-8859-15',
        'x-europa', 'iso-8859-8-i',
        'iso-2022-jp', 'csiso2022jp', 'iso-2022-jp', 'iso-2022-kr',
        'x-cp50227', 'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as',
        'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa',
        'utf-7', 'utf-32', 'utf-32be'
    );

    /**
     * @param string $url URL of the page to inspect.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetches the page on first use and prepares its body for parsing.
     *
     * On the first call the page is fetched; on HTTP 200 the charset is
     * detected on the raw bytes and cached, then the body is converted to
     * UTF-8 and ASCII-lowercased. Later calls reuse the stored response.
     *
     * @param string $url URL to fetch (only used on the first call).
     * @return bool True when the stored response has status 200.
     */
    private function open($url)
    {
        if ($this->request === null) {
            $this->request = new Snoopy();
            $this->request->fetch($url);

            if ($this->request->status == 200) {
                $raw = $this->request->results;

                // Detect on the untouched bytes and remember the answer, so a
                // later getCharset() call is not fooled by the converted body.
                $this->charset = $this->detectCharset($raw);

                // Convert first, lowercase second: lowercasing GBK/Big5/Shift_JIS
                // bytes directly would corrupt characters whose trail byte is A-Z.
                $this->request->results = $this->asciiLower($this->toUtf8($raw, $this->charset));
            }
        }

        return $this->request->status == 200;
    }

    /**
     * Returns the page title, meta keywords, meta description and IPv4 address.
     *
     * Every field defaults to an empty string; all four stay empty when the
     * page cannot be fetched with status 200.
     *
     * @return array Keys: 'title', 'keywords', 'desc', 'ip'.
     */
    public function getWebinfo()
    {
        $info = array(
            'title'    => '',
            'keywords' => '',
            'desc'     => '',
            'ip'       => ''
        );

        if (!$this->open($this->url)) {
            return $info;
        }

        $html = $this->request->results;

        // The "<title" opener had been lost from the published pattern.
        if (preg_match('/<title\b[^>]*>([^>]*)<\/title>/si', $html, $titleMatch)) {
            $info['title'] = strip_tags($titleMatch[1]);
        }

        $metaTags = $this->extractMetaTags($html);
        if (isset($metaTags['keywords'])) {
            $info['keywords'] = $metaTags['keywords'];
        }
        if (isset($metaTags['description'])) {
            $info['desc'] = $metaTags['description'];
        }

        $info['ip'] = $this->resolveIp();

        return $info;
    }

    /**
     * Guesses from raw bytes whether text looks like UTF-8 or GB2312.
     *
     * Scans for the first non-ASCII byte sequence: a lead byte with the top
     * three bits set followed by two high-bit bytes counts as UTF-8; a lead
     * byte with the top two bits set followed by one high-bit byte counts as
     * GB2312.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept for backward compatibility.
     * @return string 'utf-8', 'gb2312', or '' when no multibyte sequence is recognised.
     */
    public function t($string, $o)
    {
        $encoding = '';
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            if ($this->byteAt($string, $i) < 128) {
                continue;
            }

            if (($this->byteAt($string, $i) & 224) == 224) {
                // Lead byte accepted; check the first continuation byte.
                if (($this->byteAt($string, ++$i) & 128) == 128) {
                    // First continuation accepted; check the second one.
                    if (($this->byteAt($string, ++$i) & 128) == 128) {
                        $encoding = 'UTF-8';
                        break;
                    }
                }
            }

            // $i may have advanced above; the byte now under the cursor is
            // re-examined as a possible double-byte lead.
            if (($this->byteAt($string, $i) & 192) == 192) {
                if (($this->byteAt($string, ++$i) & 128) == 128) {
                    $encoding = 'GB2312';
                    break;
                }
            }
        }

        return strtolower($encoding);
    }

    /**
     * Replaces five-digit decimal numeric entities (e.g. "&#20013;") with the
     * characters they denote.
     *
     * Only entities with exactly five digits are decoded, so markup-significant
     * entities such as "&#60;" are left alone. Text containing quotes,
     * backslashes or newlines is handled (the earlier JSON-based decoding
     * returned null for such input).
     *
     * @param string $str  Text containing numeric entities.
     * @param string $code Target encoding; anything other than 'utf-8' is produced via iconv().
     * @return string|false Decoded text; false if iconv() fails.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $str = preg_replace_callback(
            '/&#(\d{5});/',
            function ($entity) {
                return html_entity_decode($entity[0], ENT_NOQUOTES, 'UTF-8');
            },
            $str
        );

        if ($code != 'utf-8') {
            $str = iconv('utf-8', $code, $str);
        }

        return $str;
    }

    /**
     * Returns the charset of the page, in lowercase.
     *
     * Detection order: <meta> charset declaration, then the response headers,
     * then mb_detect_encoding(). The result is computed once, on the raw
     * response, when the page is first fetched.
     *
     * @return string|false Charset name, or false when the page cannot be
     *                      fetched or no charset can be determined.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        return $this->charset;
    }

    /**
     * Determines the charset of a raw response body.
     *
     * @param string $content Raw, unconverted response body.
     * @return string|false Lowercase charset name, or false when undetermined.
     */
    private function detectCharset($content)
    {
        // 1. Charset declared in the HTML itself.
        if (preg_match('/<meta.+?charset=[^\w]?([-\w]+)/i', $content, $meta)) {
            $declared = strtolower($meta[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                // A gb2312 declaration is only trusted when the bytes agree.
                if ($declared !== 'gb2312' || $this->t($content, $declared) === $declared) {
                    return $declared;
                }
            }
        }

        // 2. Charset announced in the response headers.
        if (!empty($this->request->headers)) {
            $headerText = strtolower(implode('|||', $this->request->headers));
            if (preg_match('/charset=[^\w]?([-\w]+)/is', $headerText, $header)) {
                return $header[1];
            }
        }

        // 3. Statistical guess over a fixed candidate list.
        $candidates = array(
            'UTF-8', 'GB2312', 'GBK', 'BIG5', 'ASCII', 'EUC-JP',
            'Shift_JIS', 'CP936', 'ISO-8859-1', 'JIS', 'eucjp-win', 'sjis-win'
        );
        $guess = mb_detect_encoding($content, $candidates);

        return $guess ? strtolower($guess) : false;
    }

    /**
     * Converts a response body to UTF-8 according to the detected charset.
     *
     * 'windows-1252' pages only get their numeric entities decoded. Charsets
     * that mbstring does not know are left unconverted rather than raising an
     * error.
     *
     * @param string       $content Raw response body.
     * @param string|false $charset Detected charset, or false when unknown.
     * @return string Converted body (or the input when no conversion applies).
     */
    private function toUtf8($content, $charset)
    {
        if ($charset === false || $charset === '' || $charset === 'utf-8') {
            return $content;
        }
        if ($charset === 'windows-1252') {
            return $this->uni_decode($content);
        }
        if (!$this->isSupportedEncoding($charset)) {
            return $content;
        }

        return mb_convert_encoding($content, 'UTF-8', $charset);
    }

    /**
     * Tells whether mbstring knows an encoding by name or alias (case-insensitive).
     *
     * @param string $charset Encoding name to look up.
     * @return bool
     */
    private function isSupportedEncoding($charset)
    {
        foreach (mb_list_encodings() as $name) {
            if (strcasecmp($name, $charset) === 0) {
                return true;
            }
            foreach (mb_encoding_aliases($name) as $alias) {
                if (strcasecmp($alias, $charset) === 0) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Collects meta tag values keyed by meta name.
     *
     * Tags written as name-then-content are tried first; if neither
     * "keywords" nor "description" is found that way, tags written as
     * content-then-name are used instead. Later duplicates overwrite earlier ones.
     *
     * @param string $html Page markup.
     * @return array Map of meta name => content value.
     */
    private function extractMetaTags($html)
    {
        $nameFirst = '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        $contentFirst = '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';

        preg_match_all($nameFirst, $html, $found);
        $names = $found[1];
        $values = $found[2];

        if (!in_array('keywords', $names, true) && !in_array('description', $names, true)) {
            preg_match_all($contentFirst, $html, $found);
            $names = $found[2];
            $values = $found[1];
        }

        $tags = array();
        foreach ($names as $index => $name) {
            $tags[$name] = $values[$index];
        }

        return $tags;
    }

    /**
     * Resolves the host of the configured URL to an IPv4 address.
     *
     * @return string Dotted IPv4 address, or '' when the host cannot be
     *                extracted or does not resolve.
     */
    private function resolveIp()
    {
        $url = $this->url;
        // parse_url() only reports a host when a scheme is present.
        if (!preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $url)) {
            $url = 'http://' . $url;
        }

        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '' || strlen($host) > 255) {
            return '';
        }

        // gethostbyname() hands back the unmodified host name on failure, so
        // the result must be validated as an address, not just counted by dots.
        $ip = gethostbyname($host);

        return filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false ? $ip : '';
    }

    /**
     * Lowercases ASCII letters only, independent of locale, leaving every
     * other byte (including UTF-8 multibyte sequences) untouched.
     *
     * @param string $text
     * @return string
     */
    private function asciiLower($text)
    {
        return strtr($text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz');
    }

    /**
     * Returns the byte value at an offset, or 0 when the offset is past the
     * end of the string.
     *
     * @param string $string
     * @param int    $index
     * @return int
     */
    private function byteAt($string, $index)
    {
        return isset($string[$index]) ? ord($string[$index]) : 0;
    }
}
href="https://www.apizl.com/archives/view-133931-1.html">php把图片处理成圆形透明的图片,做圆形透明头像,圆形头像_编程资料分享</a></li> </ul> </div> </div> <div class="clear"></div> </div> </div> <div id="baidu_div" class="t-10 label-div border-all"> </div> <!--讨论区--> <div id="comment_div" class="t-10 label-div border-all pr-20 pl-20"> <!-- 评论箱 --> </div> <!--讨论区结束--> </div> <!--右侧代码--> <div class="span4"> <!--栏目导航--> <div class="label-div t-5 border-all"> <div class="label-main tody-hot l-15"> <div class="lanmu-div"> <h3 class="label-title border-b h3-h">栏目导航</h3> <ul> <li><a href='https://www.apizl.com/category/list-9.html' target="_blank">PHP</a></li> <li><a href='https://www.apizl.com/category/list-2.html' target="_blank">C#</a></li> <li><a href='https://www.apizl.com/category/list-139.html' target="_blank">Android</a></li> <li><a href='https://www.apizl.com/category/list-322.html' target="_blank">JSP(JAVA)</a></li> <li><a href='https://www.apizl.com/category/list-83.html' target="_blank">前端教程</a></li> <li><a href='https://www.apizl.com/category/list-158.html' target="_blank">Delphi</a></li> <li><a href='https://www.apizl.com/category/list-10.html' target="_blank">Javascript</a></li> <li><a href='https://www.apizl.com/category/list-354.html' target="_blank">Yii</a></li> <li><a href='https://www.apizl.com/category/list-346.html' target="_blank">微信小程序</a></li> <li><a href='https://www.apizl.com/category/list-425.html' target="_blank">百度小程序</a></li> <li><a href='https://www.apizl.com/category/list-326.html' target="_blank">Python</a></li> <li><a href='https://www.apizl.com/category/list-324.html' target="_blank">ASP.NET</a></li> <li><a href='https://www.apizl.com/category/list-65.html' target="_blank">单片机</a></li> <li><a href='https://www.apizl.com/category/list-155.html' target="_blank">WebApp编程</a></li> <li><a href='https://www.apizl.com/category/list-341.html' target="_blank">nodejs</a></li> <li><a href='https://www.apizl.com/category/list-325.html' target="_blank">正则处理</a></li> <li><a 
href='https://www.apizl.com/category/list-15.html' target="_blank">Dedecms</a></li> <li><a href='https://www.apizl.com/category/list-56.html' target="_blank">ecshop</a></li> <li><a href='https://www.apizl.com/category/list-23.html' target="_blank">phpcms</a></li> <li><a href='https://www.apizl.com/category/list-66.html' target="_blank">Discuz</a></li> <li><a href='https://www.apizl.com/category/list-159.html' target="_blank">帝国CMS</a></li> <li><a href='https://www.apizl.com/category/list-160.html' target="_blank">WordPress</a></li> <li><a href='https://www.apizl.com/category/list-117.html' target="_blank">易语言</a></li> <li><a href='https://www.apizl.com/category/list-356.html' target="_blank">GO语言</a></li> <li><a href='https://www.apizl.com/category/list-357.html' target="_blank">GIT使用</a></li> <li><a href='https://www.apizl.com/category/list-113.html' target="_blank">Arduino</a></li> <li><a href='https://www.apizl.com/category/list-380.html' target="_blank">VBS</a></li> <li><a href='https://www.apizl.com/category/list-381.html' target="_blank">Cmd批处理</a></li> <li><a href='https://www.apizl.com/category/list-327.html' target="_blank">编程更多</a></li> </ul> </div> </div> </div> <div class="label-div t-5 border-all" style="text-align: center;"> <a href="https://www.taomizhan.com/?apizl_view" target="_blank"><img src="/public/youce.jpg"></a> </div> <!--最新文章--> <div class="label-div t-5 border-all"> <div class="label-main tody-hot l-15"> <h3 class="label-title border-b h3-h">最新文章</h3> <ul> <li class="li-w"><a title="yii2中引入三方类库 " href="https://www.apizl.com/archives/view-148792-1.html">yii2中引入三方类库</a></li> <li class="li-w"><a title="thinkphp where中or多个like模糊搜索 " href="https://www.apizl.com/archives/view-148787-1.html">thinkphp where中or多个like模糊搜索</a></li> <li class="li-w"><a title="mysql 启动提示Plugin 'InnoDB' init functi " href="https://www.apizl.com/archives/view-148786-1.html">mysql 启动提示Plugin 'InnoDB' init functi</a></li> <li class="li-w"><a title="php 网站域名被墙判断请求方法 " 
href="https://www.apizl.com/archives/view-148783-1.html">php 网站域名被墙判断请求方法</a></li> <li class="li-w"><a title="php中的php.ini设置session超时 " href="https://www.apizl.com/archives/view-148782-1.html">php中的php.ini设置session超时</a></li> <li class="li-w"><a title="php 条形图 树状图 使用GD库生成 非js chart " href="https://www.apizl.com/archives/view-148778-1.html">php 条形图 树状图 使用GD库生成 非js chart</a></li> </ul> </div> </div> <!--相关文章--> <div class="label-div t-5 border-all"> <div class="label-main tody-hot l-15"> <h3 class="label-title border-b h3-h">点击排行</h3> <ul> <li class="li-w"><a title="让自己网站对接google谷歌第三方登录接口详解说明 " href="https://www.apizl.com/archives/view-148749-1.html">让自己网站对接google谷歌第三方登录接口详解说明</a></li> <li class="li-w"><a title="shopnc修改原来的后台菜单新增功能(详解) " href="https://www.apizl.com/archives/view-132884-1.html">shopnc修改原来的后台菜单新增功能(详解)</a></li> <li class="li-w"><a title="php实现mysql数据库分表分段备份 " href="https://www.apizl.com/archives/view-47389-1.html">php实现mysql数据库分表分段备份</a></li> <li class="li-w"><a title="sphinx 建立索引这样提示 无法启动FATAL: out of memory (unable to allocate 802217663 bytes) " href="https://www.apizl.com/archives/view-32218-1.html">sphinx 建立索引这样提示 无法启动FATAL: out of memory (unable to allocate 802217663 bytes)</a></li> <li class="li-w"><a title="PhpMyAdmin出现export.php Missing parameter: what /export " href="https://www.apizl.com/archives/view-41905-1.html">PhpMyAdmin出现export.php Missing parameter: what /export</a></li> <li class="li-w"><a title="php使用xpath来进行采集页面的内容 " href="https://www.apizl.com/archives/view-134324-1.html">php使用xpath来进行采集页面的内容</a></li> </ul> </div> </div> <!--推荐文章--> <div class="label-div t-5 border-all"> <div class="label-main tody-hot l-15"> <h3 class="label-title border-b h3-h">推荐文章</h3> <ul> <li class="li-w"><a title="PHP、C# RSA加密交互问题解决方法 " href="https://www.apizl.com/archives/view-134283-1.html">PHP、C# RSA加密交互问题解决方法</a></li> <li class="li-w"><a title="PHP网站被黑下马处理以及防黑大全解读 " href="https://www.apizl.com/archives/view-148757-1.html">PHP网站被黑下马处理以及防黑大全解读</a></li> <li 
class="li-w"><a title="php下json_encode使用gbk输出时候null无输出内容解决办法 " href="https://www.apizl.com/archives/view-141566-1.html">php下json_encode使用gbk输出时候null无输出内容解决办法</a></li> <li class="li-w"><a title="window下redis和memache扩展的安装 " href="https://www.apizl.com/archives/view-134164-1.html">window下redis和memache扩展的安装</a></li> <li class="li-w"><a title="shopnc数据库处理的事务调用 " href="https://www.apizl.com/archives/view-132892-1.html">shopnc数据库处理的事务调用</a></li> </ul> </div> </div> <!--打赏--> <div class="label-div t-5 border-all"> <div class="label-main tody-hot l-15"> <h3 class="label-title border-b h3-h">打赏</h3> <img src="/public/weixin.png" width="120" alt="weixin"> <img src="/public/alipay.png" width="120" alt="alipay"> </div> </div> </div> </div> </div> <div class="footer"> <div class="clearfix" style="background:#EDEDED;"> </div> <p> <br/> Copyright ©2014 apizl.com    <a href="http://beian.miit.gov.cn/" rel="nofollow" target="_blank">粤ICP备15076105号-1</a>   本站运行: 3986天 <script async src="https://www.googletagmanager.com/gtag/js?id=UA-131433579-3"></script> <script> window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-131433579-3'); </script> <br/> <font>网页模板、字体、软件、资料、资源部分是从国外大小网站收集而来,为朋友们在工作或学习时提高效率、节省时间.</font> <BR/> <font>站内所有资源仅供学习与参考,请勿用于商业用途,否则产生的一切后果将由您自己承担! 
</font> <br/> </p> </div> <script type="text/javascript" src="/js/bioV4.min.js"></script> <script type="text/javascript" src="/js/jquery.lazyload.mini.js"></script> <script type="text/javascript" src="/js/function.js"></script> <script type="text/javascript" src="/js/borsertocss.js"></script> <script src="/public/mobile/index/scripts/postbird-img-glass.js"></script> <link href="/include/ueditor/third-party/SyntaxHighlighter/shCoreDefault.css" rel="stylesheet" type="text/css"/> <script type="text/javascript" src="/include/ueditor/third-party/SyntaxHighlighter/shCore.js"></script> <script type="text/javascript" src="//cpro.baidustatic.com/cpro/ui/c.js" async="async" defer="defer" ></script> <script type="text/javascript"> SyntaxHighlighter.all(); PostbirdImgGlass.init({ domSelector: ".view-content img", animation: true }); </script> <script src="https://www.apizl.com/plus/count.php?view=yes&aid=41387&mid=18" type='text/javascript' language="javascript"></script> <script type="text/javascript"> /**baidu tongji**/ (function () { var hm = document.createElement("script"); hm.src = "//hm.baidu.com/hm.js?5c3af8dba9537e88753b3508ea2eb9fa"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s); })(); /**360**/ (function(){ var src = (document.location.protocol == "http:") ? "http://js.passport.qihucdn.com/11.0.1.js?ff16c6ce0bb415bdcb40cdd7adf44451":"https://jspassport.ssl.qhimg.com/11.0.1.js?ff16c6ce0bb415bdcb40cdd7adf44451"; document.write('<script src="' + src + '" id="sozz"><\/script>'); })(); </script> </body> </html>