PHP制作百度词典查词采集器 _爱资料
主页 > 编程资料 > PHP >
发布时间:2015-12-29 作者:网络 阅读:196次

百度dict 采集样本

写的采集百度dict词典翻译后的所有结果数据,当然附带了13.5w单词库和采集简单的案例,这里我把写出的主要类dict.class.php放出来,项目地址http://github.com/widuu/baidu_dict,有需要的直接fork就可以了~么么哒,这东西用的人很少,所以有用的兄弟拿走了哈~

 音标
	 *				"pro"	 => 发音
	 *				"example"=> 例句
	 *				"explain"=> 简明释义
	 *				"synonym"=> 同反义词
	 *				"phrase" => 短语数组
	 *			)
   *
	 */
	public function content($word){
		// Store the word first: getContent() builds its request URL from $this->word.
		$this->word = $word;

		// Array elements are evaluated top to bottom, so the getters run in the
		// same order as before. Note that every getter calls getContent() on its
		// own, so one lookup downloads the result page once per getter.
		return array(
			"symbol"  => $this->Pronounced(),	// phonetic symbols
			"pro"     => $this->getSay(),		// pronunciation
			"example" => $this->getExample(),	// example sentences
			"explain" => $this->getExplain(),	// concise definitions
			"synonym" => $this->getSynonym(),	// synonyms / antonyms
			"phrase"  => $this->getPhrase(),	// phrase list
		);
	}


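	/**
	 * Fetch the Baidu dictionary result page for the current word via cURL (HTTP GET).
	 *
	 * @return mixed whatever curl_exec() returns (the page body as a string on success)
	 */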

	private function getContent(){
		// Downloads the dict.baidu.com result page for $this->word with a desktop
		// Firefox user agent. Returns the response body as a string, or false when
		// the transfer fails (curl_exec() semantics with CURLOPT_RETURNTRANSFER).
		$useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";

		// URL-encode the word so spaces, '&', '#' or non-ASCII characters cannot
		// break the request or smuggle extra query parameters into it.
		$url = "http://dict.baidu.com/s?wd=".urlencode((string) $this->word);

		$ch = curl_init();
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);	// return the body instead of printing it
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_HTTPGET, 1);
		curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
		curl_setopt($ch, CURLOPT_HEADER, 0);			// body only, no response headers
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);

		$result = curl_exec($ch);

		// Bug fix: the error check used an undefined $curl variable, so failures
		// were never reported (and PHP 8 raises a TypeError for the null handle).
		// The check must run before curl_close() while the handle is still valid.
		if (curl_errno($ch)) {
			echo 'Errno'.curl_error($ch);
		}
		curl_close($ch);

		return $result;
	}


	/**
	 * Get the phonetic symbols of the current word.
	 *
	 * @return array array('en' => British, 'us' => American)
	 */

	private function Pronounced(){
		// Extracts the text that follows each "EN-US"> marker up to the next </b>
		// in the downloaded page; the first hit is reported as 'en', the second as 'us'.
		$data = $this -> getContent();
		preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui",$data,$pronounced);

		// A page may contain fewer than two matches (unknown word, failed download).
		// Guard the offsets so the result is an explicit null rather than an
		// "undefined offset" notice; the returned values are otherwise unchanged.
		return array(
			'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
			'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
		);
	}

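	/**
	 * Get the pronunciation entries of the current word
	 * (the values of the url="..." attributes found in the page).
	 *
	 * @return array array('en' => British, 'us' => American)
	 */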

	private function getSay(){
		// Collects the values of url="..." attributes in the downloaded page;
		// the first hit is reported as 'en', the second as 'us'.
		$data = $this -> getContent();
		preg_match_all("/url=\"(.*)\"/Ui",$data,$pronounced);

		// Fewer than two matches is possible (unknown word, failed download).
		// Guard the offsets so the result is an explicit null rather than an
		// "undefined offset" notice; the returned values are otherwise unchanged.
		return array(
			'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
			'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
		);
	}

	/**
   * 获取百度翻译例句
   * return array() 多维数组 例句
   * 
	 */

	private function getExample(){
		// Parses the `var example_data = [...];` JavaScript literal embedded in the
		// page. Returns a list of two-element chunks, each element being the
		// space-joined quoted tokens of one inner group (presumably a sentence and
		// its translation - verify against live data). Returns an empty array when
		// the page carries no example data.
		$data = $this -> getContent();
		preg_match_all("/var example_data = (.*)\]\;/Us",$data,$example);

		// No example block on the page (or the download failed): return an empty
		// list instead of reading an undefined offset and producing array(array("")).
		if (!isset($example[1][0]) || $example[1][0] === '') {
			return array();
		}

		// Normalise the opening brackets, then split into the top-level groups.
		$data1 = "[[[".ltrim($example[1][0],"[");
		$data2 = explode("[[[",$data1);

		$str = "";
		foreach ($data2 as $value) {
			// Each top-level group is split again into its inner "[[" groups.
			$data3 = explode("[[","[[".$value);
			foreach ($data3 as $v) {
				// Grab the first quoted string of every ["token", ...] entry.
				preg_match_all("/\[\"(.*)\",/Us","[".$v, $match);
				if (!empty($match[1])) {
					// Bug fix: implode() takes the separator first; the legacy
					// (array, separator) order is rejected by PHP 8.
					$str .= implode(" ", $match[1])."@";
				}
			}
		}

		// "@" is the temporary separator between the joined groups.
		$data4 = trim($str,"@");
		if ($data4 === '') {
			return array();
		}
		$data5 = explode("@", $data4);

		// Pair consecutive entries.
		return array_chunk($data5, 2);
	}

	/**
   * 获取简明释义
   * return array (x => "词性",b => "附属")
   * 
	 **/

	private function getExplain(){
		// Builds the "concise definitions" result: array("x" => ..., "b" => ...),
		// both filled from the section of the page that follows id="en-simple-means">.
		$data = $this -> getContent();
		// NOTE(review): this pattern ends in `\/Us`, i.e. the closing delimiter is
		// escaped and the regex has no terminator. It looks like an HTML-like
		// fragment was stripped from the listing - TODO restore from the upstream source.
		preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\/Us",$data,$explain);
		// Read without a guard: raises a notice when nothing matched.
		$r_data = $explain[1][0];
		// NOTE(review): the two patterns below contain no named groups, yet the loops
		// further down read the groups "adj"/"name" and "tag"/"word". The `(?P<...>`
		// names and surrounding tags appear to have been stripped - TODO restore.
		preg_match_all("/\\(?P.*)\<\/strong\>\(?P.*)\<\/span\>\<\/p\>/Us", $r_data, $a_data);
		preg_match_all("/\(?P[^\>]+)\:\(?P.*)\<\/a\>\<\/span\>/Us", $r_data, $b_data);
		
		// Map every "adj" capture to the "name" capture at the same index.
		$result = array();
		foreach ($a_data["adj"] as $key => $value) {
			$result[$value] = $a_data["name"][$key];
		}
		
		// Map every "tag" capture to its "word" capture with HTML tags removed.
		$word_b = array();
		foreach ($b_data["tag"] as $key => $value) {
			$word_b[$value] = strip_tags($b_data["word"][$key]);
		}
		
		$result_data = array("x" => $result,"b" => $word_b);

 		return $result_data;
	}


	/**
   * 获取同义词
   * return array(0 => "同义词", 1 => "反义词") 一般为多维数组
   * 
	 */

	private function getSynonym(){
		// Builds a nested array from the section of the page that follows
		// id="en-syn-ant">: $result[section index][adj][title] = text.
		$data = $this -> getContent();
		preg_match_all("/id=\"en\-syn\-ant\"\>(.*)/Us",$data,$synonym);
		// Read without a guard: raises a notice when nothing matched.
		$content = $synonym[1][0];
		// NOTE(review): the separator is an empty string, which explode() rejects.
		// An HTML tag was probably stripped from this literal - TODO restore from
		// the upstream source.
		$data1 = explode("", $content);
		$result = array();
		$data2 = array();
		foreach ($data1 as $key => $value) {
			// NOTE(review): this pattern looks damaged as well - it defines no
			// "adj"/"content" named groups although both are read right below.
			preg_match_all("/\(?P.*)\ \;\<\/strong\>\<\\>\\(?.*)\<\/ul\>/Us", $value, $r_data);
			$data2[$key]["adj"] = $r_data["adj"];
			$data2[$key]["content"] = $r_data["content"];
		}

		foreach ($data2 as $key => $value) {
			foreach ($value["content"] as $k => $v) {
				if(!empty($v)){
					// NOTE(review): the "title" group read below is missing from this
					// pattern; only the "value" group survived.
					preg_match_all("/\\(?P.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $v, $v_data);
					foreach ($v_data['title'] as $m => $d) {
						// "<" and ">" act as the regex delimiters here, so the pattern is
						// the literal "</a>": closing anchors become spaces before the
						// remaining tags are stripped. $data is reused as a scratch
						// variable and no longer holds the page content from here on.
						$data = strip_tags(preg_replace("<</a>>"," ", $v_data["value"][$m]));
						$result[$key][$value["adj"][$k]][$d] = $data;
					}
				}
			}
		}
 		return $result;
	}

	/**
   * 获取短语词组
   * return array (key => value) 一维或者多维数组
   * 
	 */

	private function getPhrase(){
		// Builds an array keyed by phrase (tags and spaces removed) from the section
		// of the page that follows id="en-phrase">.
		// self::$num is a static property declared outside this excerpt; it is used
		// below as the maximum number of "</dd>"-separated entries to process.
		$num = self::$num;
		$data = $this -> getContent();
		// NOTE(review): this pattern ends in `\/Us`, i.e. the closing delimiter is
		// escaped and the regex has no terminator. It looks like an HTML-like
		// fragment was stripped from the listing - TODO restore from the upstream source.
		preg_match_all("/id=\"en\-phrase\"\>(.*)\/Us",$data,$phrase);
		// $data is reused: from here on it holds the "</dd>"-separated entries.
		$data = explode("</dd>",$phrase[1][0]);
		$data1 = array_slice($data,0,$num);
		$result = array();
		foreach ($data1 as $key => $value) {
			$data2 = explode("</p>", $value);
			$n = count($data2);
			if($n<=3){
				// Simple entry: first fragment is the key, second fragment the value.
				$result[str_replace(" ","",strip_tags($data2[0]))] = strip_tags($data2[1]);
			}else{
				// Longer entry: drop the last fragment, treat the first two as
				// key/value and the rest as additional fragments.
				$data3 = array_slice($data2,0,$n-1);
				$data4 = array_slice($data2,0,2);
				// array_diff compares by value, so any later fragment identical to one
				// of the first two is removed as well.
				$res = array_diff($data3,$data4);
				$data5 = array_chunk($res,2);
				$key_value = trim(str_replace(" ","",strip_tags($data4[0])));
				$result[$key_value] = strip_tags($data4[1]);
				// This loop reuses the names $key/$value of the outer loop; the outer
				// foreach re-assigns both on its next iteration.
				foreach ($data5 as $key => $value) {
					foreach ($value as $k => $v) {
						$value[$k] = strip_tags($v);
					}
					// Every pass wraps the value stored so far together with the current
					// chunk, so several chunks produce progressively nested arrays.
					$array = array($result[$key_value],$value);
					// The key was assigned just above, so this check always succeeds.
					if (array_key_exists($key_value, $result)){
						$result[$key_value] = $array;
					}
				}
				
			}
		}
		return $result;
	}

	/**
	 * 将数组转换为字符串
	 *
	 * @param  array  $data    数组
	 * @param  bool  $isformdata 如果为0,则不使用new_stripslashes处理,可选参数,默认为1
	 * @return  string 返回字符串,如果,data为空,则返回空
	 */
	private function array2string($data, $isformdata = 1) {
		// Loose comparison on purpose: anything that compares equal to '' yields ''.
		if ($data == '') {
			return '';
		}

		// Optionally strip slashes (recursively) before exporting.
		$source = $isformdata ? $this->new_stripslashes($data) : $data;

		// Export as PHP source text, then escape it.
		$exported = var_export($source, TRUE);
		return addslashes($exported);
	}

	/**
	 * 返回经stripslashes处理过的字符串或数组
	 * @param $string 需要处理的字符串或数组
	 * @return mixed
	 */
	private function new_stripslashes($string) {
		// Arrays are processed element by element, recursing into nested arrays;
		// keys and their order are kept.
		if (is_array($string)) {
			$cleaned = array();
			foreach ($string as $index => $item) {
				$cleaned[$index] = $this->new_stripslashes($item);
			}
			return $cleaned;
		}

		// Anything that is not an array goes straight through stripslashes().
		return stripslashes($string);
	}

}

// $word = new dict("express");
// $word ->content("express");</pre>

<p>以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。</p>


                            
                        </div>
                        <div style="padding-top:20px;"></div>
                        
                        <div class='tag'>
                                关键字词:
                        </div>
                        <div class="clear"></div>
                        <div class='xg'>
                            <h3 class="xiangguantuijian">相关文章</h3>
                            <ul class="xg-ul">
                                <li><a class="fl" title="php 批量替换富文本html中src网址" href="https://www.apizl.com/archives/view-148791-1.html">php 批量替换富文本html中src网址</a></li>
<li><a class="fl" title="thinkphp where中or多个like模糊搜索" href="https://www.apizl.com/archives/view-148787-1.html">thinkphp where中or多个like模糊搜索</a></li>
<li><a class="fl" title="php 网站域名被墙判断请求方法" href="https://www.apizl.com/archives/view-148783-1.html">php 网站域名被墙判断请求方法</a></li>
<li><a class="fl" title="php中的php.ini设置session超时" href="https://www.apizl.com/archives/view-148782-1.html">php中的php.ini设置session超时</a></li>
<li><a class="fl" title="php 条形图 树状图 使用GD库生成 非js chart" href="https://www.apizl.com/archives/view-148778-1.html">php 条形图 树状图 使用GD库生成 非js chart</a></li>
<li><a class="fl" title="ThinkPHP(TP) where多条件查询" href="https://www.apizl.com/archives/view-148773-1.html">ThinkPHP(TP) where多条件查询</a></li>
<li><a class="fl" title="php去除小数点后面的0,如 0.2300" href="https://www.apizl.com/archives/view-148770-1.html">php去除小数点后面的0,如 0.2300</a></li>
<li><a class="fl" title="关于Yii2高级版多后台开发实例说明" href="https://www.apizl.com/archives/view-148766-1.html">关于Yii2高级版多后台开发实例说明</a></li>
<li><a class="fl" title="php开启和关闭短标签功能" href="https://www.apizl.com/archives/view-148764-1.html">php开启和关闭短标签功能</a></li>
<li><a class="fl" title="PHP网站被黑下马处理以及防黑大全解读" href="https://www.apizl.com/archives/view-148757-1.html">PHP网站被黑下马处理以及防黑大全解读</a></li>

                            </ul>
                        </div>
                    </div>
                    <div class="clear"></div>

                </div>
            </div>

            <div id="baidu_div" class="t-10 label-div border-all">

            </div>
            <!--讨论区-->
            <div id="comment_div" class="t-10 label-div border-all pr-20 pl-20">
                <!-- 评论箱  -->
                
            </div>
            <!--讨论区结束-->

        </div>


        <!--右侧代码-->
        <div class="span4">
            <!--栏目导航-->
            <div class="label-div t-5 border-all">
                <div class="label-main tody-hot l-15">
                    <div class="lanmu-div">
                        <h3 class="label-title border-b h3-h">栏目导航</h3>
                        <ul>
                            
                            <li><a href='https://www.apizl.com/category/list-9.html' target="_blank">PHP</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-2.html' target="_blank">C#</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-139.html' target="_blank">Android</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-322.html' target="_blank">JSP(JAVA)</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-83.html' target="_blank">前端教程</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-158.html' target="_blank">Delphi</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-10.html' target="_blank">Javascript</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-354.html' target="_blank">Yii</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-346.html' target="_blank">微信小程序</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-425.html' target="_blank">百度小程序</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-326.html' target="_blank">Python</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-324.html' target="_blank">ASP.NET</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-65.html' target="_blank">单片机</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-155.html' target="_blank">WebApp编程</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-341.html' target="_blank">nodejs</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-325.html' target="_blank">正则处理</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-15.html' target="_blank">Dedecms</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-56.html' target="_blank">ecshop</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-23.html' target="_blank">phpcms</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-66.html' target="_blank">Discuz</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-159.html' target="_blank">帝国CMS</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-160.html' target="_blank">WordPress</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-117.html' target="_blank">易语言</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-356.html' target="_blank">GO语言</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-357.html' target="_blank">GIT使用</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-113.html' target="_blank">Arduino</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-380.html' target="_blank">VBS</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-381.html' target="_blank">Cmd批处理</a></li>
                            
                            <li><a href='https://www.apizl.com/category/list-327.html' target="_blank">编程更多</a></li>
                            
                        </ul>
                    </div>
                </div>
            </div>

            <div class="label-div t-5 border-all" style="text-align: center;">
                <a href="https://www.taomizhan.com/?apizl_view" target="_blank"><img src="/public/youce.jpg"></a>
            </div>

            <!--最新文章-->
            <div class="label-div t-5 border-all">
                <div class="label-main tody-hot l-15">
                    <h3 class="label-title border-b h3-h">最新文章</h3>
                    <ul>
                        <li class="li-w"><a title="yii2中引入三方类库 " href="https://www.apizl.com/archives/view-148792-1.html">yii2中引入三方类库</a></li>
<li class="li-w"><a title="thinkphp where中or多个like模糊搜索 " href="https://www.apizl.com/archives/view-148787-1.html">thinkphp where中or多个like模糊搜索</a></li>
<li class="li-w"><a title="mysql 启动提示Plugin 'InnoDB' init functi " href="https://www.apizl.com/archives/view-148786-1.html">mysql 启动提示Plugin 'InnoDB' init functi</a></li>
<li class="li-w"><a title="php 网站域名被墙判断请求方法 " href="https://www.apizl.com/archives/view-148783-1.html">php 网站域名被墙判断请求方法</a></li>
<li class="li-w"><a title="php中的php.ini设置session超时 " href="https://www.apizl.com/archives/view-148782-1.html">php中的php.ini设置session超时</a></li>
<li class="li-w"><a title="php 条形图 树状图 使用GD库生成 非js chart " href="https://www.apizl.com/archives/view-148778-1.html">php 条形图 树状图 使用GD库生成 非js chart</a></li>

                    </ul>
                </div>
            </div>
            <!--相关文章-->
            <div class="label-div t-5 border-all">
                <div class="label-main tody-hot l-15">
                    <h3 class="label-title border-b h3-h">点击排行</h3>
                    <ul>
                        <li class="li-w"><a title="让自己网站对接google谷歌第三方登录接口详解说明 " href="https://www.apizl.com/archives/view-148749-1.html">让自己网站对接google谷歌第三方登录接口详解说明</a></li>
<li class="li-w"><a title="shopnc修改原来的后台菜单新增功能(详解) " href="https://www.apizl.com/archives/view-132884-1.html">shopnc修改原来的后台菜单新增功能(详解)</a></li>
<li class="li-w"><a title="php实现mysql数据库分表分段备份 " href="https://www.apizl.com/archives/view-47389-1.html">php实现mysql数据库分表分段备份</a></li>
<li class="li-w"><a title="sphinx 建立索引这样提示 无法启动FATAL: out of memory (unable to allocate 802217663 bytes) " href="https://www.apizl.com/archives/view-32218-1.html">sphinx 建立索引这样提示 无法启动FATAL: out of memory (unable to allocate 802217663 bytes)</a></li>
<li class="li-w"><a title="PhpMyAdmin出现export.php Missing parameter: what /export " href="https://www.apizl.com/archives/view-41905-1.html">PhpMyAdmin出现export.php Missing parameter: what /export</a></li>
<li class="li-w"><a title="php使用xpath来进行采集页面的内容 " href="https://www.apizl.com/archives/view-134324-1.html">php使用xpath来进行采集页面的内容</a></li>

                    </ul>
                </div>
            </div>
            <!--推荐文章-->
            <div class="label-div t-5 border-all">
                <div class="label-main tody-hot l-15">
                    <h3 class="label-title border-b h3-h">推荐文章</h3>
                    <ul>
                        <li class="li-w"><a title="window下redis和memache扩展的安装 " href="https://www.apizl.com/archives/view-134164-1.html">window下redis和memache扩展的安装</a></li>
<li class="li-w"><a title="PHP、C# RSA加密交互问题解决方法 " href="https://www.apizl.com/archives/view-134283-1.html">PHP、C# RSA加密交互问题解决方法</a></li>
<li class="li-w"><a title="php下json_encode使用gbk输出时候null无输出内容解决办法 " href="https://www.apizl.com/archives/view-141566-1.html">php下json_encode使用gbk输出时候null无输出内容解决办法</a></li>
<li class="li-w"><a title="shopnc数据库处理的事务调用 " href="https://www.apizl.com/archives/view-132892-1.html">shopnc数据库处理的事务调用</a></li>
<li class="li-w"><a title="PHP网站被黑下马处理以及防黑大全解读 " href="https://www.apizl.com/archives/view-148757-1.html">PHP网站被黑下马处理以及防黑大全解读</a></li>

                    </ul>
                </div>
            </div>

            <!--打赏-->
            <div class="label-div t-5 border-all">
                <div class="label-main tody-hot l-15">
                    <h3 class="label-title border-b h3-h">打赏</h3>
                    <img src="/public/weixin.png" width="120" alt="weixin">
                    <img src="/public/alipay.png" width="120" alt="alipay">
                </div>
            </div>
        </div>

    </div>
</div>

<div class="footer">
    <div class="clearfix" style="background:#EDEDED;">
    </div>
    <p>
        <br/>
        Copyright ©2014 apizl.com  
         <a href="http://beian.miit.gov.cn/"  rel="nofollow" target="_blank">粤ICP备15076105号-1</a>
          本站运行:
        3988天
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-131433579-3"></script>
<script>
    window.dataLayer = window.dataLayer || [];
    function gtag(){dataLayer.push(arguments);}
    gtag('js', new Date());

    gtag('config', 'UA-131433579-3');
</script>
        <br/>
        <font>网页模板、字体、软件、资料、资源部分是从国外大小网站收集而来,为朋友们在工作或学习时提高效率、节省时间.</font>
        <BR/>
        <font>站内所有资源仅供学习与参考,请勿用于商业用途,否则产生的一切后果将由您自己承担! </font>
        <br/>
    </p>
</div>


<script type="text/javascript" src="/js/bioV4.min.js"></script>
<script type="text/javascript" src="/js/jquery.lazyload.mini.js"></script>
<script type="text/javascript" src="/js/function.js"></script>
<script type="text/javascript" src="/js/borsertocss.js"></script>
<script src="/public/mobile/index/scripts/postbird-img-glass.js"></script>
<link href="/include/ueditor/third-party/SyntaxHighlighter/shCoreDefault.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="/include/ueditor/third-party/SyntaxHighlighter/shCore.js"></script>
<script type="text/javascript" src="//cpro.baidustatic.com/cpro/ui/c.js" async="async" defer="defer" ></script>
<script type="text/javascript">
    SyntaxHighlighter.all();
    PostbirdImgGlass.init({
        domSelector: ".view-content img",
        animation: true
    });

    

</script>
<script src="https://www.apizl.com/plus/count.php?view=yes&aid=46357&mid=18" type='text/javascript' language="javascript"></script>
<script type="text/javascript">
    /**baidu tongji**/
    (function () {
        var hm = document.createElement("script");
        hm.src = "//hm.baidu.com/hm.js?5c3af8dba9537e88753b3508ea2eb9fa";
        var s = document.getElementsByTagName("script")[0];
        s.parentNode.insertBefore(hm, s);
    })();
    /**360**/
    (function(){
        var src = (document.location.protocol == "http:") ? "http://js.passport.qihucdn.com/11.0.1.js?ff16c6ce0bb415bdcb40cdd7adf44451":"https://jspassport.ssl.qhimg.com/11.0.1.js?ff16c6ce0bb415bdcb40cdd7adf44451";
        document.write('<script src="' + src + '" id="sozz"><\/script>');
    })();
</script>


</body>
</html>