网站地图    收藏   

主页 > 后端 > php资料库 >

PHP采集系统源代码_自学php网

来源:自学PHP网    时间:2014-12-04 22:12 作者: 阅读:

[导读] 今天公司PHP牛人教了PHP采集系统的原理^_^,太牛了! 代码如下 <?php //获得网页内容 function getFileContents($url) { $user_agent = "User-Agent: Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Windows 2000; Windows X...

今天公司PHP牛人教了PHP采集系统的原理^_^,太牛了!

代码如下
<?php

/**
 * Fetch the content of a web page with a raw-socket HTTP/1.0 GET request.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array Always contains 'state', one of "NOHOST", "Cannot send request",
 *               "timeout", "jump" or "ok". For "jump" and "ok" the response body
 *               is returned under 'file'.
 */
function getFileContents($url) {
    // Header value only; the "User-Agent: " name is added when the request is built.
    $user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Windows 2000; Windows XP)";
    $contents = array();

    $urlparts = parse_url($url);
    if (!is_array($urlparts) || empty($urlparts['host'])) {
        // Without a host there is nothing to connect to.
        $contents['state'] = "NOHOST";
        print "Error: invalid URL";
        return $contents;
    }

    $host = $urlparts['host'];
    $scheme = isset($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "http";

    // An empty path is not a valid request target; fall back to the root.
    $path = (isset($urlparts['path']) && $urlparts['path'] !== "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] !== "") {
        $path .= "?" . $urlparts['query'];
    }

    $default_port = ($scheme === "https") ? 443 : 80;
    $port = isset($urlparts['port']) ? (int) $urlparts['port'] : $default_port;
    // The Host header carries the port only when it is not the scheme default.
    $portq = ($port === $default_port) ? "" : ":" . $port;

    $request = "GET {$path} HTTP/1.0\r\n"
        . "Host: {$host}{$portq}\r\n"
        . "Accept: */*\r\n"
        . "Accept-Encoding: identity\r\n"
        . "User-Agent: {$user_agent}\r\n"
        . "\r\n";

    $fsocket_timeout = 60;
    $target = ($scheme === "https") ? "ssl://" . $host : $host;

    $errno = 0;
    $errstr = "";
    // Warnings are suppressed because the failure is reported through $errstr below.
    $fp = @fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $contents['state'] = "NOHOST";
        print "Error: $errstr";
        return $contents;
    }

    if (!fwrite($fp, $request)) {
        fclose($fp);
        $contents['state'] = "Cannot send request";
        return $contents;
    }

    stream_set_timeout($fp, $fsocket_timeout);
    $data = "";
    $timed_out = false;
    while (!feof($fp)) {
        $chunk = fgets($fp, 8192);
        // The timeout flag must be re-read after every read to be noticed.
        $meta = stream_get_meta_data($fp);
        if (!empty($meta['timed_out'])) {
            $timed_out = true;
            break;
        }
        if ($chunk === false) {
            break;
        }
        $data .= $chunk;
    }
    fclose($fp);

    if ($timed_out) {
        $contents['state'] = "timeout";
        return $contents;
    }

    // Split the response into header block and body at the first blank line.
    $separator = strpos($data, "\r\n\r\n");
    if ($separator === false) {
        $headers = $data;
        $body = "";
    } else {
        $headers = substr($data, 0, $separator);
        $body = (string) substr($data, $separator + 4);
    }

    // Redirect heuristic: a Location header together with "Cache-Control: private".
    // Only the header block is inspected so page text cannot trigger it.
    $is_jump = stripos($headers, "Location: ") !== false
        && stripos($headers, "Cache-Control: private") !== false;

    $contents['state'] = $is_jump ? "jump" : "ok";
    $contents['file'] = $body;

    return $contents;
}

/**
 * Check whether the file behind a URL is available and in a readable form.
 *
 * Sends an HTTP/1.1 HEAD request over a raw socket and inspects the response headers.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array Always contains 'state': "ok", "NOHOST", "Unreachable: http <code>",
 *               "Relocation: http <code>", "Timed out (no reply from server)" or
 *               "Not text or html". Optional keys: 'content' ("text", "pdf" or "doc"),
 *               'date' (Last-Modified value) and 'path' (redirect target).
 */
function url_status($url) {
    // Header value only; the "User-Agent: " name is added when the request is built.
    $user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Windows 2000; Windows XP)";
    $status = array();

    $urlparts = parse_url($url);
    if (!is_array($urlparts) || empty($urlparts['host'])) {
        $status['state'] = "NOHOST";
        return $status;
    }

    $host = $urlparts['host'];
    $scheme = isset($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "http";

    // An empty path is not a valid request target; fall back to the root.
    $path = (isset($urlparts['path']) && $urlparts['path'] !== "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] !== "") {
        $path .= "?" . $urlparts['query'];
    }

    $default_port = ($scheme === "https") ? 443 : 80;
    $port = isset($urlparts['port']) ? (int) $urlparts['port'] : $default_port;
    // The Host header carries the port only when it is not the scheme default.
    $portq = ($port === $default_port) ? "" : ":" . $port;

    // "Connection: close" keeps an HTTP/1.1 server from holding the socket open.
    $request = "HEAD {$path} HTTP/1.1\r\n"
        . "Host: {$host}{$portq}\r\n"
        . "Accept: */*\r\n"
        . "Accept-Charset: iso-8859-1\r\n"
        . "Accept-Encoding: identity\r\n"
        . "User-Agent: {$user_agent}\r\n"
        . "Connection: close\r\n"
        . "\r\n";

    $target = ($scheme === "https") ? "ssl://" . $host : $host;

    $fsocket_timeout = 60;
    $errno = 0;
    $errstr = "";
    // Warnings are suppressed because the failure is reported through the returned state.
    $fp = @fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $status['state'] = "NOHOST";
        return $status;
    }

    stream_set_timeout($fp, $fsocket_timeout);
    fwrite($fp, $request);

    // Status line, e.g. "HTTP/1.1 200 OK".
    $answer = fgets($fp, 4096);
    $httpcode = 0;          // first digit of the status code (response class)
    $full_httpcode = "";    // full three-digit status code
    $regs = array();
    if ($answer !== false && preg_match('#HTTP/[0-9.]+ (([0-9])[0-9]{2})#', $answer, $regs)) {
        $httpcode = (int) $regs[2];
        $full_httpcode = $regs[1];

        if ($httpcode !== 2 && $httpcode !== 3) {
            fclose($fp);
            $status['state'] = "Unreachable: http $full_httpcode";
            return $status;
        }
    }

    // Read every header up to the blank line so the result does not depend on header order.
    $location = null;
    $content_type = null;
    while ($answer !== false) {
        $answer = fgets($fp, 4096);
        if ($answer === false || rtrim($answer, "\r\n") === "") {
            break;
        }

        if ($location === null && preg_match('/^Location: *([^\n\r ]+)/i', $answer, $regs)) {
            $location = $regs[1];
        }

        if (preg_match('/^Last-Modified: *([a-z0-9,: ]+)/i', $answer, $regs)) {
            $status['date'] = $regs[1];
        }

        if ($content_type === null && preg_match('/^Content-Type: *([a-z\/]*)/i', $answer, $regs)) {
            $content_type = strtolower($regs[1]);
        }
    }

    $socket_status = stream_get_meta_data($fp);
    fclose($fp);

    // Redirects other than 302 are reported to the caller with their target.
    if ($location !== null && $httpcode === 3 && $full_httpcode !== "302") {
        $status['path'] = $location;
        $status['state'] = "Relocation: http $full_httpcode";
        return $status;
    }

    if ($content_type !== null) {
        if ($content_type === 'text/html' || $content_type === 'text/' || $content_type === 'text/plain') {
            $status['content'] = 'text';
            $status['state'] = 'ok';
        } elseif ($content_type === 'application/pdf') {
            $status['content'] = 'pdf';
            $status['state'] = 'ok';
        } elseif ($content_type === 'application/msword') {
            $status['content'] = 'doc';
            $status['state'] = 'ok';
        } else {
            $status['state'] = "Not text or html";
        }
    } elseif (!empty($socket_status['timed_out'])) {
        $status['state'] = "Timed out (no reply from server)";
    } else {
        $status['state'] = "Not text or html";
    }

    return $status;
}


// Scraper configuration: site root plus the HTML markers used to cut out list entries and links.
$host = 'http://www.admin5.com';
$list_exp = '<div class="itembox"';
$url_start = '<a href="';
$url_end = '" target=';
$detail_title_start = '<h1>';
$detail_title_end = '</h1>';
$detail_summary_start = '<div id="arctext">';
// NOTE(review): identical to the start marker and not used below; confirm the intended end marker.
$detail_summary_end = '<div id="arctext">';

// Walk the list pages from the last one down to page 1.
$max_page = 179;
for ($page = $max_page; $page > 0; $page--) {

    $url = "http://www.admin5.com/browse/26/list_" . $page . ".shtml";

    $status = url_status($url);

    // url_status() sets 'content' only on success, so test for it before comparing.
    if (!isset($status['content'], $status['state'])
        || $status['content'] != 'text'
        || $status['state'] != 'ok') {
        continue;
    }

    $files = getFileContents($url);

    // 'file' is missing when the fetch failed (NOHOST, timeout, send error).
    if (!isset($files['file'])) {
        continue;
    }

    $contents = $files['file'];

    // Element 0 is the markup before the first list item, so iteration starts at 1.
    $arr = explode($list_exp, $contents);
    $item_count = count($arr);

    for ($i = 1; $i < $item_count; $i++) {
        // Cut the href value out of the first link in this list item.
        $link_start = strpos($arr[$i], $url_start);
        if ($link_start === false) {
            continue;
        }
        $link_start += strlen($url_start);

        $link_end = strpos($arr[$i], $url_end, $link_start);
        if ($link_end === false) {
            continue;
        }

        $detail_url = $host . substr($arr[$i], $link_start, $link_end - $link_start);

        $summary = getFileContents($detail_url);

        // Demo behaviour: dump the first fetched detail page and stop the script.
        print_r($summary);
        exit;
    }

}

?>

自学PHP网专注网站建设学习,PHP程序学习,平面设计学习,以及操作系统学习

京ICP备14009008号-1@版权所有www.zixuephp.com

网站声明:本站所有视频,教程都由网友上传,站长收集和分享给大家学习使用,如有牵扯版权问题,请联系站长邮箱904561283@qq.com

添加评论