/*
 * Two-step collection ("caiji") routine driven by $step.
 *
 *   step 1: download every list page configured in $info['url'] ('|'-separated),
 *           pull the links out of the configured container element, apply the
 *           include/exclude filters and cache the resulting URL list with S().
 *   step 2: download ONE cached URL (index taken from the request value "i"),
 *           extract title / keywords / description / source / body using the
 *           project's selector rules and store the row in `collect_content`.
 *
 * The helpers below are closures because this code lives inside a larger routine
 * that is not visible from here.
 */

// Tags removed from every scraped field by the project helper _strip_tags().
$stripTagList = array('div', 'ins', 'ul', 'script', 'object', 'li', 'span', 'iframe', 'style');

/**
 * Downloads $queryURL with spoofed client headers and loads the response into
 * phpQuery as the current document.
 *
 * @param string $queryURL page to download (also sent as the Referer)
 * @param mixed  $charset  value of $info['charsets']
 * @return string the (possibly converted) response body, '' when the download failed
 */
$loadPage = function ($queryURL, $charset) {
    // Random last octet for the forged client-address headers.
    $cip = '123.125.68.' . mt_rand(0, 254);
    $xip = '125.90.88.' . mt_rand(0, 254);
    // user agent
    $useragent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)";
    // forged request headers
    $header = array(
        'Accept-Language: zh-cn',
        'Connection: Keep-Alive',
        'Cache-Control: no-cache',
        'CLIENT-IP:' . $cip,
        'X-FORWARDED-FOR:' . $xip,
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_REFERER, $queryURL);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
    curl_setopt($ch, CURLOPT_URL, $queryURL);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; continue with an empty document so
    // the caller ends up with empty fields instead of passing a boolean around.
    if ($result === false) {
        $result = '';
    }

    // NOTE(review): this converts FROM UTF-8 TO GB2312 although the document is
    // handed to phpQuery as utf-8 right below; the direction looks inverted.
    // Left unchanged - confirm against a real gb2312 source before swapping.
    if ($charset == 'gb2312') {
        $result = iconv("UTF-8", "GB2312//IGNORE", $result);
    }
    // NOTE(review): the search string is empty here, which makes this call a
    // no-op; the original needle was presumably lost - TODO restore it.
    $result = str_ireplace('', ' ', $result);

    \phpQuery::newDocumentHTML($result, 'utf-8');

    return $result;
};

/**
 * Returns the inner HTML of the element addressed by an id ('id' mode) or a
 * class name (any other mode) in the current phpQuery document.
 */
$selectHtml = function ($mode, $name) {
    return pq(($mode == 'id' ? '#' : '.') . $name)->html();
};

/**
 * Removes the configured filter text from $value and strips the unwanted tags.
 */
$cleanField = function ($value, $filter) use ($stripTagList) {
    return _strip_tags($stripTagList, str_replace($filter, "", $value));
};

if ($step == 1) {
    $listUrls = array_filter(explode('|', $info['url']));
    vendor('phpQuery.phpQuery');

    // Must be an array: appending to '' with [] is an error on PHP >= 7.1 and
    // array_unique() below needs an array even when nothing was found.
    $uok = array();

    foreach ($listUrls as $queryURL) {
        $result = $loadPage($queryURL, $info['charsets']);

        // Container element that holds the article links.
        if ($info['html'] == 'id') {
            $c = pq('#' . $info['name'])->html();
        } else {
            $c = pq('.' . $info['name'])->html();
        }

        $u = match_links($c);
        if (!is_array($u) || !isset($u['link']) || !is_array($u['link'])) {
            // No links could be extracted from this list page.
            continue;
        }

        foreach ($u['link'] as $url) {
            // "Must contain" filters. When both are set the match is
            // case-insensitive (stristr); a single filter is case-sensitive
            // (strstr). NOTE(review): the mismatch is kept from the original.
            if ($info['bh'] != '' && $info['bh1'] != '') {
                if (!stristr($url, $info['bh']) || !stristr($url, $info['bh1'])) {
                    continue;
                }
            } else {
                if ($info['bh'] != '' && !strstr($url, $info['bh'])) {
                    continue;
                }
                if ($info['bh1'] != '' && !strstr($url, $info['bh1'])) {
                    continue;
                }
            }

            // "Must not contain" filters: any configured fragment rejects the URL.
            if ($info['bbh'] != '' && strstr($url, $info['bbh'])) {
                continue;
            }
            if ($info['bbh1'] != '' && strstr($url, $info['bbh1'])) {
                continue;
            }

            // Optional prefix for relative links.
            if ($info['www'] != '') {
                $url = $info['www'] . $url;
            }
            $uok[] = $url;
        }
    }

    // Drop duplicates and rebuild a 0-based index.
    $uok = array_values(array_unique($uok));
    S('caijiurl', $uok, 7200);
    echo '总共获取到有效待采集网址 ' . count($uok) . " \n" . '条,下一步将进行采集内容操作,请不要关闭或者刷新本页面!';
    sleep(1);
    // Automatic redirect to step 2 is disabled:
    //header("Location:".U('News/go_collect_project',array('step'=>2))."");
} else if ($step == 2) {
    $i = intval(I('i')); // step counter: index of the URL handled by this request
    $caijiurl = S('caijiurl');
    if (!is_array($caijiurl)) {
        // Cache missing or expired: behave like an empty work list.
        $caijiurl = array();
    }
    $c_caijiurl = count($caijiurl);

    $queryURL = isset($caijiurl[$i]) ? $caijiurl[$i] : '';
    echo $queryURL . "\n";

    $data = $info;
    set_time_limit(0);
    vendor('phpQuery.phpQuery');
    $result = $loadPage($queryURL, $info['charsets']);

    /*
     * For title / keywords / description an empty mode (0, '0', '') selects the
     * built-in default; 'id' and anything else select by id or class.
     * empty() is used instead of "== 0" because 'id' == 0 is true before PHP 8,
     * which made the selector branches unreachable there.
     */

    /* title */
    if (empty($data['content_title_html'])) {
        $title = pq('title')->html();
    } else {
        $title = $selectHtml($data['content_title_html'], $data['content_title_name']);
    }
    $title = $cleanField($title, $data['content_title_filter']);

    /* keywords */
    if (empty($data['content_keywords_html'])) {
        $keywords = pq($data['content_keywords_name'])->attr('content');
    } else {
        $keywords = $selectHtml($data['content_keywords_html'], $data['content_keywords_name']);
    }
    // Previously the selector branches filtered $title here (copy-paste slip).
    $keywords = $cleanField($keywords, $data['content_keywords_filter']);

    /* description */
    if (empty($data['content_description_html'])) {
        $description = pq($data['content_description_name'])->attr('content');
    } else {
        $description = $selectHtml($data['content_description_html'], $data['content_description_name']);
    }
    $description = $cleanField($description, $data['content_description_filter']);

    /* source: only extracted when a selector mode is configured */
    if ($data['content_source_html'] == 'id' || $data['content_source_html'] == 'class') {
        $source = $cleanField(
            $selectHtml($data['content_source_html'], $data['content_source_name']),
            $data['content_source_filter']
        );
    } else {
        $source = '';
    }

    /* body content: by id when configured so, by class otherwise */
    $content = $cleanField(
        $selectHtml($data['content_contenttext_html'], $data['content_contenttext_name']),
        $data['content_contenttext_filter']
    );
    $content = htmlspecialchars(addslashes($content));

    // Optionally pass the body through the project's image-saving helper.
    if ($info['mypic'] == 1) {
        $content = auto_save_image(stripslashess($content));
        $content = htmlspecialchars(addslashes($content));
    }

    // Optionally pass the body through the project's link-replacing helper,
    // giving it the site's own URLs from the cached config.
    $c = S('config');
    if ($info['mylink'] == 1) {
        $content = Replace_Links(stripslashess($content), array($c['web_url'], $c['web_m_url']));
        $content = htmlspecialchars(addslashes($content));
    }

    // The copied project settings are not needed past this point.
    $data = '';

    // Optionally use the first image of the body as the thumbnail. Defaults to
    // null so the variable is always defined when the row is built below.
    $litpic = null;
    if ($info['mypic_one'] == 1) {
        $picarray = auto_return_image(stripslashess($content));
        if (is_array($picarray) && isset($picarray[0])) {
            $litpic = $picarray[0];
        }
    }

    if ($title != '') {
        // Skip items whose title already exists in `article_content`.
        $check = M('article_content')->where(array('title' => $title))->find();
        if (!$check) {
            $odata = array(
                'title'       => $title,
                'litpic'      => $litpic,
                'keywords'    => $keywords,
                'description' => $description,
                'source'      => $source,
                'time'        => time(),
                'typeid'      => $info['typeid'],
                'content'     => $content,
                'project_id'  => $info['id'],
            );
            M('collect_content')->add($odata);
            echo '采集 ' . $title . '成功';
        } else {
            echo '采集 ' . $title . '失败,标题重复!';
        }
    } else {
        echo '采集失败,标题为空,请检查本项目的采集规则!';
    }

    $i++;
    // Valid indexes are 0 .. count-1, so the run is complete as soon as the
    // next index equals the count (the former ">" ran one step too many).
    if ($i >= $c_caijiurl) {
        S('caijiurl', null);
        M('collect_project')->where(array('id' => $info['id']))->save(array('edit_time' => time()));
        die("\n" . '采集完毕,全部内容已经采入临时库!');
    }
}