background image

PHP 获取百度相关搜索和原创的程度

PHP

 

获取百度相关搜索和原创程度

获取原创程度的方法是 get_em_times:返回的数值越高、文本长度越长,内容就越不原创。

用于采集项目的搜索引擎优化

class baidu {

        public static function build_request_uri($words){
                $words = urlencode(Text::convert_encoding($words,'GBK','UTF-8'));
                return '/s?wd='.$words;
        }
        public static function get_relate_search($words){
                $content = self::get_content($words);
                if(preg_match('/\<div id="rs"\>(((?!\<\/div\>).)*)\<\/div\>/', $content,$result)){
                        if(preg_match_all('/\<a((?!\>).)*\>(.*?)\<\/a\>/', $result[1], $result)) return 
$result[2];
                }
                return FALSE;
        }
        /**
         * 根据空格分词,获取百度关键词完全匹配<em></em>的次数
         * @param string $words
         * @return int 
         */
        public static function get_em_times($words){
                $word = explode(' ', $words);
                $content = self::get_content($words);
                preg_match_all('/\<em\>(((?!\<\/em\>).)*)\<\/em\>/', $content,$result);
                if(!isset($result[1]) || count($result[1])==0)
                        return 0;
                $word_table = array_count_values($result[1]);
                $times = 0;
                foreach ($word as $pice){
                        if(array_key_exists($pice, $word_table)) $times+=$word_table[$pice];
                }
                return $times;
        }
        public static function get_content($words){
                static $loaded_contents = array();
                if(!isset($loaded_contents[$words])){
                                                $content   = 
file_get_contents("http://www.baidu.com".self::build_request_uri($words));
                        $content = Text::convert_encoding($content,'UTF-8','GBK');
                        $loaded_contents[$words] = $content;
                }
                return $loaded_contents[$words];
        }