这几天关注了一下PHP的采集程序,才发现用PHP采集内容是这么方便,把经常用到的采集函数在这里总结一下,方便以后使用!
获取所有链接内容和地址
/**
 * Extract every hyperlink (anchor text and href) from an HTML fragment.
 *
 * The pattern and array keys were reconstructed from a copy of this snippet
 * whose quotes and angle brackets had been stripped.
 *
 * @param string $code HTML source to scan.
 * @return array array('name' => list of anchor texts, 'url' => list of hrefs),
 *               both in document order and index-aligned.
 */
function getAllURL($code)
{
    // href must be the first attribute of the <a> tag; the value may be
    // double-quoted, single-quoted or unquoted. The anchor text must not
    // contain '>' (i.e. no nested tags).
    $pattern = '/<a\s+href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/i';
    preg_match_all($pattern, $code, $arr);

    return array('name' => $arr[2], 'url' => $arr[1]);
}
获取所有的图片地址
/**
 * Collect the absolute URLs of all images referenced by <img> tags.
 *
 * Only URLs of the form scheme://host/path.ext are returned, where ext is one
 * of jpg, gif, bmp, bnp or png (case-insensitive) and the attribute value ends
 * right after the extension.
 *
 * @param string $code HTML source to scan.
 * @return array List of image URLs in document order (empty if none).
 */
function getImgSrc($code)
{
    // Group 1 is the opening quote (either kind), group 2 the URL.
    // The URL is restricted to non-quote, non-space characters so a match
    // cannot run past the end of the src attribute into a later one.
    // "bnp" is kept from the original extension list for compatibility.
    $reg = '#<img[^>]*src=(["\'])(https?://[^"\'\s>/]+/[^"\'\s>]+\.(?:jpg|gif|bmp|bnp|png))\1#i';
    preg_match_all($reg, $code, $img_array, PREG_PATTERN_ORDER);

    return $img_array[2];
}
当前的脚本网址
/**
 * Return the URL (path plus query string) of the currently running script.
 *
 * Uses $_SERVER['REQUEST_URI'] when it is set and non-empty; otherwise falls
 * back to $_SERVER['PHP_SELF'], appending '?' and $_SERVER['QUERY_STRING']
 * when the query string is non-empty.
 *
 * @return string The script URL; an empty string when none of the server
 *                variables are available.
 */
function getSelfURL()
{
    if (!empty($_SERVER['REQUEST_URI'])) {
        return $_SERVER['REQUEST_URI'];
    }

    // isset() guards avoid undefined-index notices when a key is missing.
    $scriptName = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '';
    if (empty($_SERVER['QUERY_STRING'])) {
        return $scriptName;
    }

    return $scriptName . '?' . $_SERVER['QUERY_STRING'];
}
把全角数字转为半角数字
/**
 * Convert full-width digits to half-width (ASCII) digits and normalise the
 * result to a bare number string.
 *
 * After the digit conversion every character other than 0-9 and '.' is
 * removed, and zeros at the very start of the string are stripped.
 *
 * @param string $fnum Text containing full-width and/or ASCII digits.
 * @return string|int The cleaned digit string, or integer 0 when nothing
 *                    remains.
 */
function getAlabNum($fnum)
{
    // NOTE(review): the full-width literals were flattened to ASCII in the
    // copy this was restored from; U+FF10..U+FF19 is assumed from the stated
    // purpose ("full-width digits to half-width digits").
    $fullWidth = array('０', '１', '２', '３', '４', '５', '６', '７', '８', '９');
    $halfWidth = array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
    $fnum = str_replace($fullWidth, $halfWidth, $fnum);

    // preg_replace replaces ereg_replace, which no longer exists in PHP 7+.
    $fnum = preg_replace('/[^0-9.]|^0+/', '', $fnum);
    if ($fnum === '' || $fnum === null) {
        $fnum = 0;
    }

    return $fnum;
}
将纯文本转换为HTML(转义尖括号,换行转为 br 标签)
/**
 * Convert plain text into HTML-safe markup.
 *
 * Angle brackets are replaced by entities and every run of CR/LF characters
 * becomes a single "<br/>" followed by CRLF.
 *
 * @param string $txt Plain text.
 * @return string Text suitable for embedding in an HTML page.
 */
function text2Html($txt)
{
    // NOTE(review): the literals of the three str_replace calls were lost in
    // the copy this was restored from. Two ASCII spaces -> one ideographic
    // space (U+3000) and '<' / '>' -> entities are assumed - confirm against
    // the upstream source.
    $txt = str_replace('  ', "\xE3\x80\x80", $txt);
    $txt = str_replace('<', '&lt;', $txt);
    $txt = str_replace('>', '&gt;', $txt);

    // Greedy match: a lazy quantifier here would emit one <br/> per CR and
    // per LF, doubling the breaks on Windows line endings.
    $txt = preg_replace('/[\r\n]+/', "<br/>\r\n", $txt);

    return $txt;
}
清除HTML标记
/**
 * Neutralise HTML markup by turning angle brackets into entities.
 *
 * @param string $str Text that may contain HTML tags.
 * @return string The text with '<' and '>' replaced by '&lt;' and '&gt;'.
 */
function clearHtml($str)
{
    // NOTE(review): the search/replace literals were lost in the copy this
    // was restored from; '<' -> '&lt;' and '>' -> '&gt;' is assumed - confirm
    // against the upstream source.
    return str_replace(array('<', '>'), array('&lt;', '&gt;'), $str);
}
相对路径转化成绝对路径
/**
 * Rewrite root-relative href/src attributes in $content to absolute URLs.
 *
 * The scheme and host are taken from $feed_url. Only attribute values that
 * start with a single '/' are rewritten; protocol-relative values ("//host/..")
 * and document-relative values ("img/a.png") are left untouched.
 *
 * @param string $content  HTML whose links should be made absolute.
 * @param string $feed_url URL the content was fetched from.
 * @return string The rewritten HTML, or $content unchanged when no host or no
 *                http/https/ftp scheme can be found in $feed_url.
 */
function relative2Absolute($content, $feed_url)
{
    preg_match('#(http|https|ftp)://#', $feed_url, $protocol);

    // Strip the scheme, then everything from the first '/' on, leaving the host.
    $server_url = preg_replace('#(http|https|ftp|news)://#', '', $feed_url);
    $server_url = preg_replace('#/.*#', '', $server_url);

    if ($server_url == '' || !isset($protocol[0])) {
        return $content;
    }

    // Escape '\' and '$' so the base URL is never read as a back-reference.
    $base = addcslashes($protocol[0] . $server_url, '\\$');

    // (?!/) skips protocol-relative URLs, which would otherwise be turned
    // into "scheme://host//other-host/...".
    return preg_replace(
        '#(href|src)=(["\'])/(?!/)#i',
        '${1}=${2}' . $base . '/',
        $content
    );
}
获取指定标记中的内容
/**
 * Return the text found between a start marker and an end marker.
 *
 * The text after the first occurrence of $start is taken up to the next
 * occurrence of $start (if any), and then cut at the first $end.
 *
 * @param string $str   Text to search.
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 * @return string|null The enclosed text; '' when $start does not occur;
 *                     null when either marker is empty.
 */
function getTagData($str, $start, $end)
{
    if ($start == '' || $end == '') {
        return null;
    }

    $parts = explode($start, $str);
    if (!isset($parts[1])) {
        // Start marker absent: avoid the undefined-index access and return
        // the empty string.
        return '';
    }

    $parts = explode($end, $parts[1]);

    return $parts[0];
}
HTML表格的每行转为CSV格式数组
/**
 * Turn each row of an HTML table into one CSV-style line.
 *
 * Every cell is wrapped in double quotes and cells are joined with commas,
 * e.g. '"a","b"'. Quotes inside cell text are not escaped, and all ASCII
 * spaces are removed from the result (including those inside cell text).
 *
 * @param string $table HTML of the table.
 * @return array One string per table row, in document order.
 */
function getTrArray($table)
{
    // Opening <td ...> becomes the opening quote, </td> the closing quote
    // plus separator, </tr> a row marker that survives tag stripping.
    $table = preg_replace('#<td[^>]*?>#si', '"', $table);
    $table = str_ireplace('</td>', '",', $table);
    $table = str_ireplace('</tr>', '{tr}', $table);

    // Strip every remaining HTML tag.
    $table = preg_replace('#<[/!]*?[^<>]*?>#si', '', $table);

    // Strip line breaks together with the whitespace that follows them.
    $table = preg_replace('#([\r\n])[\s]+#', '', $table);

    // NOTE(review): the literals of the two original str_replace calls were
    // lost in the copy this was restored from; removing ' ' and '&nbsp;' is
    // assumed - confirm against the upstream source.
    $table = str_replace(array(' ', '&nbsp;'), '', $table);

    // Each row ends with ',{tr}'; the piece after the last marker is not a row.
    $rows = explode(',{tr}', $table);
    array_pop($rows);

    return $rows;
}
将HTML表格的每行每列转为数组,采集表格数据
/**
 * Convert an HTML table into a two-dimensional array of cell texts.
 *
 * All ASCII spaces are removed from the cell texts.
 *
 * @param string $table HTML of the table.
 * @return array List of rows, each a list of cell strings; an empty array
 *               when the input contains no closed row.
 */
function getTdArray($table)
{
    // Drop the opening table/row/cell tags; the closing tags become markers
    // that survive the generic tag stripping below.
    $table = preg_replace('#<table[^>]*?>#si', '', $table);
    $table = preg_replace('#<tr[^>]*?>#si', '', $table);
    $table = preg_replace('#<td[^>]*?>#si', '', $table);
    $table = str_ireplace('</tr>', '{tr}', $table);
    $table = str_ireplace('</td>', '{td}', $table);

    // Strip every remaining HTML tag.
    $table = preg_replace('#<[/!]*?[^<>]*?>#si', '', $table);

    // Strip line breaks together with the whitespace that follows them.
    $table = preg_replace('#([\r\n])[\s]+#', '', $table);

    // NOTE(review): the literals of the two original str_replace calls were
    // lost in the copy this was restored from; removing ' ' and '&nbsp;' is
    // assumed - confirm against the upstream source.
    $table = str_replace(array(' ', '&nbsp;'), '', $table);

    // The piece after the last {tr} / {td} marker is not a row / cell.
    $rows = explode('{tr}', $table);
    array_pop($rows);

    // Initialised up front so an input without rows yields an empty array.
    $td_array = array();
    foreach ($rows as $tr) {
        $td = explode('{td}', $tr);
        array_pop($td);
        $td_array[] = $td;
    }

    return $td_array;
}
返回字符串中的所有单词 $distinct=true 去除重复
/**
 * Return all English words (runs of ASCII letters) found in a string,
 * sorted ascending.
 *
 * @param string $str      Text to scan.
 * @param bool   $distinct When true (default) duplicate words are removed.
 * @return array Sorted, zero-indexed list of words.
 */
function splitEnStr($str, $distinct = true)
{
    preg_match_all('/([a-zA-Z]+)/', $str, $match);
    $words = $match[1];

    if ($distinct) {
        $words = array_unique($words);
    }

    // sort() also re-indexes the array after array_unique() left gaps.
    sort($words);

    return $words;
}
相关推荐:
PHP采集程序中常用的函数
php 使用CURL函数采集
以上就是php中常用的采集函数的总结(附代码)的详细内容,PHP教程
郑重声明:本文版权归原作者所有,转载文章仅为传播更多信息之目的,如作者信息标记有误,请第一时间联系我们修改或删除,多谢。
新闻热点
疑难解答