欢迎访问web学习网
当前位置:主页 > PHP > 正文

php使用pthreads多线程和单进程采集网页速度对比

05-15 PHP

/**
 * Thread that downloads a single URL.
 *
 * The URL is handed over in the constructor; once the thread has run,
 * the value returned by model_http_curl_get() is available in $data.
 */
class test_thread_run extends Thread
{
    /** URL this thread fetches. */
    public $url;

    /** Result of model_http_curl_get() for $url; only set by run(). */
    public $data;

    /**
     * @param mixed $url URL to fetch when the thread runs.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Thread entry point: fetch the URL and store the result in $data.
     * A falsy URL (empty string, null, ...) is skipped and $data is left unset.
     */
    public function run()
    {
        $target = $this->url;
        if (!$target) {
            return;
        }

        $this->data = model_http_curl_get($target);
    }
}
 
/**
 * Fetches every URL in its own test_thread_run thread and collects the results.
 *
 * All threads are started first, then joined one by one, so the downloads
 * overlap instead of running back to back.
 *
 * @param array $urls_array URLs to fetch; keys are preserved in the result.
 * @return array Fetched data keyed like $urls_array. Entries whose thread could
 *               not be joined are omitted. Always an array, even for empty input.
 */
function model_thread_result_get($urls_array)
{
    // Initialise both collections up front so an empty URL list (or a run in
    // which no join succeeds) yields [] instead of undefined-variable warnings.
    $thread_array = [];
    $variable_data = [];

    foreach ($urls_array as $key => $url) {
        $thread = new test_thread_run($url);
        $thread->start();
        $thread_array[$key] = $thread;
    }

    foreach ($thread_array as $key => $thread) {
        // join() makes the current execution context wait until the
        // referenced thread has finished executing.
        if ($thread->join()) {
            $variable_data[$key] = $thread->data;
        }
    }

    return $variable_data;
}
 
/**
 * Performs a single blocking HTTP GET with curl.
 *
 * @param string $url       Address to request.
 * @param string $userAgent User-Agent header; a falsy value selects the
 *                          built-in MSIE 7 string below.
 * @return mixed Whatever curl_exec() returns with CURLOPT_RETURNTRANSFER
 *               enabled (the response body, or false on failure).
 */
function model_http_curl_get($url,$userAgent="")
{
    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,
        // Whole-transfer timeout, in seconds.
        CURLOPT_TIMEOUT        => 5,
        CURLOPT_USERAGENT      => $userAgent ?: 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    ]);

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
 
 
/**
 * Fetches several URLs concurrently through one curl multi handle
 * ("simulated multithreading").
 *
 * @param array $urls URLs to fetch; keys are preserved in the result.
 * @return array Response data keyed like $urls. A transfer whose HTTP status
 *               is not 200 is retried once, sequentially, through
 *               model_http_curl_get().
 */
function model_curl_multi_get($urls)
{
    $multi = curl_multi_init();
    $handles = [];

    // Create one easy handle per URL and register it with the multi handle.
    foreach ($urls as $index => $url) {
        $handle = curl_init($url);
        curl_setopt_array($handle, [
            CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
            CURLOPT_HEADER         => 0,
            CURLOPT_CONNECTTIMEOUT => 60,
            CURLOPT_RETURNTRANSFER => 1,
        ]);
        curl_multi_add_handle($multi, $handle);
        $handles[$index] = $handle;
    }

    // Kick off the transfers; repeat while curl asks to be called again.
    $running = null;
    do {
        $status = curl_multi_exec($multi, $running);
    } while ($status == CURLM_CALL_MULTI_PERFORM);

    // Keep driving the transfers until none is still running or an error occurs.
    while ($running && $status == CURLM_OK) {
        // If select reports -1, pause briefly before calling exec again.
        if (curl_multi_select($multi) === -1) {
            usleep(100);
        }

        do {
            $status = curl_multi_exec($multi, $running);
        } while ($status == CURLM_CALL_MULTI_PERFORM);
    }

    // Collect the results and release every easy handle.
    $data = [];
    foreach ($urls as $index => $url) {
        $handle = $handles[$index];

        $data[$index] = curl_getinfo($handle, CURLINFO_HTTP_CODE) == 200
            ? curl_multi_getcontent($handle)
            : model_http_curl_get($url);

        curl_multi_remove_handle($multi, $handle);
        curl_close($handle);
    }

    curl_multi_close($multi);

    return $data;
}
 
 
/**
 * Appends a var_export() dump of a value to a file, terminated by CRLF.
 *
 * @param mixed  $arr  Value to dump.
 * @param string $file Path of the file to append to.
 */
function writeArr($arr,$file)
{
    $dump = var_export($arr, true) . "\r\n";

    file_put_contents($file, $dump, FILE_APPEND);
}
 
 
 
// Build the list of 50 paginated listing URLs to benchmark against.
$urls_array = [];
foreach (range(1, 50) as $page) {
    $urls_array[] = "http://www.ireader.com/index.php?ca=booksort.index&pca=booksort.index&pid=10&cid=11&order=download&status=0&page=" . $page;
}

// Run 1: one pthreads thread per URL.
$started = microtime(true);
$result = model_thread_result_get($urls_array);
$finished = microtime(true);
echo "多线程耗时:" . ($finished - $started) . "\n";
writeArr($result, 'res1.txt');


// Run 2: plain sequential requests in a single process.
$started = microtime(true);
$result_new = [];
foreach ($urls_array as $key => $url) {
    $result_new[$key] = model_http_curl_get($url);
}
$finished = microtime(true);
echo "单进程耗时:" . ($finished - $started) . "\n";
writeArr($result_new, 'res2.txt');


// Run 3: concurrent requests through curl_multi ("simulated multithreading").
$started = microtime(true);
$result = model_curl_multi_get($urls_array);
$finished = microtime(true);
echo "模拟多线程耗时:" . ($finished - $started) . "\n";
writeArr($result, 'res3.txt');

采集掌阅50页书籍数据,耗时单位为秒(s),执行结果如下:


可以看出,多线程和模拟多线程(curl_multi)的采集速度明显快于单进程。

 

文章来源: 本站
打赏

取消

感谢您的支持,我会继续努力的!

扫码支持
扫码打赏,你说多少就多少

打开支付宝扫一扫,即可进行扫码打赏哦