php curl爬取网页与thinkphp直接输出图片简单记录

默认分类 PHP CURL thinkphp

简单记录下方便以后查阅。

curl类

首先封装一个 curl 类供调用。类内部仿照单例模式复用同一个 curl 句柄,以提高资源利用率、加快访问速度(至少我自测过,复用单个句柄确实可以加快速度)。
构造函数的第一个参数传入 url(必须;传入 url 数组时会批量并发请求),第二个是 cookie,第三个是请求数据,其后依次是 header、请求方式(默认 GET)和数据类型(json/form)。
2021年6月23日更新这个类

<?php

/**
 * Small fluent wrapper around PHP's cURL extension.
 *
 * A single URL is fetched through one shared easy handle (kept in a static
 * property so it can be reused between requests); an array of URLs is fetched
 * in parallel through one shared multi handle.
 *
 * Typical use: Curl::create($url, $cookie)->exec()->response
 */
class Curl
{
    /**
     * @var array Default cURL options applied to every request.
     *
     * NOTE(review): TLS peer/host verification is switched off here, which
     * accepts any certificate. Kept as in the original; confirm it is intended.
     */
    protected static $options = [
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_RETURNTRANSFER => true,
        CURLINFO_HEADER_OUT    => true,
        CURLOPT_HEADER         => true,
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13'
            . '(KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
    ];
    /**
     * @var false|resource|null Shared easy handle, created lazily by getInstance().
     */
    private static $curlInstance = null;
    /**
     * @var false|resource|null Shared multi handle, created lazily by getMultiInstance().
     */
    private static $curlMultiInstance = null;
    /**
     * @var mixed Response body (string), or a list of bodies when several URLs were requested.
     */
    public $response;
    /**
     * @var string Raw response header block (single-URL requests only).
     */
    public $responseHeader;
    /**
     * @var string Raw request header block that was sent (single-URL requests only).
     *             The misspelled name is public API and is kept for compatibility.
     */
    public $requestHerder;
    /**
     * @var null|int HTTP status code of the response (single-URL requests only).
     */
    public $responseCode;
    /**
     * @var false|resource|null Handle used by the most recent exec() call.
     */
    protected $instance = null;
    /**
     * @var array Data sent with the request (query string for GET, body for POST).
     */
    protected $data = [];
    /**
     * @var string Data type; 'json' makes the body json_encode()d, 'form' url-encodes it.
     */
    protected $dataType;
    /**
     * @var array Request headers as ['Name' => 'value'].
     */
    protected $header = [];
    /**
     * @var string|array URL to request, or an array of URLs for a parallel request.
     */
    protected $url;
    /**
     * @var array Cookies as ['name' => 'value'].
     */
    protected $cookie = [];
    /**
     * @var string Request method, 'POST' or 'GET' (default GET).
     */
    protected $requestType;

    /**
     * Curl constructor.
     * @param string|array $url         URL, or array of URLs for a parallel request
     * @param array        $cookie      cookies as ['name' => 'value']
     * @param array        $data        request data
     * @param array        $header      request headers as ['Name' => 'value']
     * @param string       $requestType request method post/get
     * @param string       $dataType    data type json/form
     */
    public function __construct(
        $url = '',
        $cookie = [],
        $data = [],
        $header = [],
        $requestType = 'GET',
        $dataType = ''
    ) {
        $this->setUrl($url);
        $this->setCookie($cookie);
        $this->setData($data);
        $this->setHeader($header);
        $this->setRequestType($requestType);
        $this->setDataType($dataType);
    }

    /**
     * Static factory; takes the same arguments as the constructor so calls can be chained.
     *
     * @return static
     */
    public static function create(
        $url = '',
        $cookie = [],
        $data = [],
        $header = [],
        $requestType = 'GET',
        $dataType = ''
    ) {
        return new static($url, $cookie, $data, $header, $requestType, $dataType);
    }

    /**
     * @return false|resource|null The shared easy handle (created on first use).
     */
    public static function getInstance()
    {
        if (self::$curlInstance === null) {
            self::$curlInstance = curl_init();
        }

        return self::$curlInstance;
    }

    /**
     * @return false|resource|null The shared multi handle (created on first use).
     */
    public static function getMultiInstance()
    {
        if (self::$curlMultiInstance === null) {
            self::$curlMultiInstance = curl_multi_init();
        }

        return self::$curlMultiInstance;
    }

    /**
     * @param mixed $url URL to request, or an array of URLs
     * @return $this
     */
    public function setUrl($url): Curl
    {
        $this->url = $url;

        return $this;
    }

    /**
     * @param array $cookie Cookies to add; USER=f3400cb5 is written as ['USER'=>'f3400cb5'].
     *                      Non-array values are ignored.
     * @return $this
     */
    public function setCookie($cookie = []): Curl
    {
        if (is_array($cookie)) {
            $this->cookie = array_merge($this->cookie, $cookie);
        }

        return $this;
    }

    /**
     * @param array $data Request data to merge in. Non-array values are ignored,
     *                    matching setCookie().
     * @return $this
     */
    public function setData($data = []): Curl
    {
        if (is_array($data)) {
            $this->data = array_merge($this->data, $data);
        }

        return $this;
    }

    /**
     * @param string $dataType Data type of the request. 'json' json_encode()s the data and
     *                         sets application/json; 'form' sets the url-encoded form type.
     * @return $this
     */
    public function setDataType($dataType = ''): Curl
    {
        $this->dataType = mb_strtolower($dataType);

        return $this;
    }

    /**
     * @param array $header Headers to merge in, ['referer' => 'some_content'].
     *                      Non-array values are ignored, matching setCookie().
     * @return $this
     */
    public function setHeader($header = []): Curl
    {
        if (is_array($header)) {
            $this->header = array_merge($this->header, $header);
        }

        return $this;
    }

    /**
     * @param string $requestType Request method, post or get (default get)
     * @return $this
     */
    public function setRequestType($requestType = 'GET'): Curl
    {
        $this->requestType = $requestType;

        return $this;
    }

    /**
     * @param string $contentType Content-Type value
     * @param string $charset     Character set appended to the Content-Type
     * @return $this
     */
    public function setContentType($contentType = 'application/x-www-form-urlencoded', $charset = 'utf-8'): Curl
    {
        $this->header = array_merge($this->header, ['Content-Type' => $contentType . '; charset=' . $charset]);

        return $this;
    }

    /**
     * Build the cURL option array for one URL from the current state.
     *
     * Side effects: a 'json' or 'form' data type switches the request to POST
     * and sets Content-Type; a POST body adds Content-Length (and an empty
     * Expect header for bodies of 1024 bytes or more) to the stored headers.
     *
     * @param string $url URL the options are built for
     * @return array cURL options suitable for curl_setopt_array()
     */
    public function getOption($url): array
    {
        $options = static::$options;

        if (!empty($this->cookie)) {
            $pairs = [];
            foreach ($this->cookie as $name => $value) {
                $pairs[] = $name . '=' . $value;
            }
            $options[CURLOPT_COOKIE] = implode('; ', $pairs);
        }

        // json/form data types imply a POST with the matching Content-Type.
        if ($this->dataType === 'json') {
            $this->setContentType('application/json')->setRequestType('post');
        } elseif ($this->dataType === 'form') {
            $this->setContentType()->setRequestType('post');
        }

        if (mb_strtolower($this->requestType) === 'post') {
            $options[CURLOPT_POST] = true;

            if (!empty($this->data)) {
                $requestData = $this->dataType === 'json'
                    ? json_encode($this->data, JSON_UNESCAPED_UNICODE)
                    : http_build_query($this->data);

                // An empty Expect header stops curl from sending
                // "Expect: 100-continue" for larger bodies.
                if (strlen($requestData) >= 1024) {
                    $this->setHeader(['Expect' => '']);
                }

                // Content-Length is a byte count, so plain strlen() is correct here.
                $this->header                = array_merge($this->header, ['Content-Length' => strlen($requestData)]);
                $options[CURLOPT_POSTFIELDS] = $requestData;
            }
        } else {
            $options[CURLOPT_POST] = false;
            if (!empty($this->data)) {
                $url = (string) $url;
                // Pick the separator so a URL that already carries a query
                // string gets "&" instead of a second "?".
                if (strpos($url, '?') === false) {
                    $separator = '?';
                } elseif (in_array(substr($url, -1), ['?', '&'], true)) {
                    $separator = '';
                } else {
                    $separator = '&';
                }
                $url .= $separator . http_build_query($this->data);
            }
        }

        $options[CURLOPT_URL] = $url;

        $headerLines = [];
        foreach ($this->header as $name => $value) {
            $headerLines[] = $name . ':' . $value;
        }
        $options[CURLOPT_HTTPHEADER] = $headerLines;

        return $options;
    }

    /**
     * Run the request. Results are stored on this object's public properties.
     *
     * With an array of URLs only $response is filled (a list of bodies in the
     * order of the URLs). The request state (url, data, headers, cookies, ...)
     * is reset afterwards, even when the request fails.
     *
     * @return $this
     * @throws Exception when the URL is empty or the single-URL transfer fails
     */
    public function exec(): Curl
    {
        if (empty($this->url)) {
            throw new Exception('URL is empty!');
        }

        try {
            if (is_array($this->url)) {
                $this->response = $this->execMulti(array_values($this->url));
            } else {
                $this->execSingle($this->url);
            }
        } finally {
            $this->restAtt();
        }

        return $this;
    }

    /**
     * @return array The result properties as an array
     */
    public function toArray(): array
    {
        return [
            'response'       => $this->response,
            'responseHeader' => $this->responseHeader,
            'requestHerder'  => $this->requestHerder,
            'responseCode'   => $this->responseCode,
        ];
    }

    /**
     * Fetch one URL through the shared easy handle and fill the result properties.
     *
     * @param string $url
     * @throws Exception when curl_exec() reports a failure
     */
    private function execSingle($url)
    {
        $this->instance = static::getInstance();

        // The handle is shared between requests: clear options left over from
        // the previous one (cookies, POST body, ...) before applying new ones.
        curl_reset($this->instance);
        curl_setopt_array($this->instance, $this->getOption($url));

        $raw = curl_exec($this->instance);
        if ($raw === false) {
            throw new Exception(
                'cURL request failed: ' . curl_error($this->instance),
                curl_errno($this->instance)
            );
        }

        // CURLOPT_HEADER is on, so the header block precedes the body.
        $headerSize = curl_getinfo($this->instance, CURLINFO_HEADER_SIZE);

        $this->responseHeader = (string) substr($raw, 0, $headerSize);
        $this->response       = (string) substr($raw, $headerSize);
        $this->responseCode   = curl_getinfo($this->instance, CURLINFO_HTTP_CODE);
        $this->requestHerder  = curl_getinfo($this->instance, CURLINFO_HEADER_OUT);
    }

    /**
     * Fetch several URLs in parallel through the shared multi handle.
     *
     * @param array $urls list of URLs
     * @return array response bodies, in the order of $urls
     */
    private function execMulti(array $urls): array
    {
        $this->instance = static::getMultiInstance();

        $handles = [];
        foreach ($urls as $url) {
            $handle  = curl_init();
            $options = $this->getOption($url);
            // Bodies only: no header block in front of each result.
            $options[CURLOPT_HEADER] = false;
            curl_setopt_array($handle, $options);
            curl_multi_add_handle($this->instance, $handle);
            $handles[] = $handle;
        }

        $running = 0;
        do {
            do {
                $status = curl_multi_exec($this->instance, $running);
            } while ($status === CURLM_CALL_MULTI_PERFORM);

            if ($status !== CURLM_OK) {
                break;
            }

            // Block until there is activity instead of spinning on the CPU.
            // A return of -1 means select could not wait, so back off briefly.
            if ($running > 0 && curl_multi_select($this->instance, 1.0) === -1) {
                usleep(1000);
            }
        } while ($running > 0);

        $bodies = [];
        foreach ($handles as $handle) {
            $bodies[] = curl_multi_getcontent($handle);
            curl_multi_remove_handle($this->instance, $handle);
        }

        return $bodies;
    }

    /**
     * Reset the request state so values do not leak into the next use of this object.
     */
    private function restAtt()
    {
        $this->data        = [];
        $this->dataType    = '';
        $this->header      = [];
        $this->url         = '';
        $this->cookie      = [];
        $this->requestType = 'GET';
    }
}

获取验证码并储存COOKIE

框架: TP6

/**
 * Fetch the remote captcha image with the user's stored cookies and return it
 * to the browser as a PNG response.
 *
 * Controller action excerpt; Cookie and response() come from the surrounding
 * application and are not defined in this file.
 */
public function captcha()
{
    $user = 1; // user identity; hard-coded to 1 for this demo
    $url  = 'http://xxx.xxx.xxx/VerifyCodeHandler.ashx'; // captcha URL

    // Load the user's cookies from the database.
    $cookieObj = new Cookie();
    $cookie    = $cookieObj->findCookieByUser($user);

    // Request the captcha image.
    $curl = Curl::create($url, $cookie->toArray())->exec();

    // Output the image directly. Content-Length is taken from the body that is
    // actually sent ($curl->response), not from an undefined $response variable.
    return response($curl->response, 200, ['Content-Length' => strlen($curl->response)])->contentType('image/png');
}

批量访问URL

/**
 * Example of fetching several URLs in parallel through the shared multi handle
 * exposed by Curl::getMultiInstance().
 */
class Test
{
    /**
     * Fetch every URL in $urls and return the response bodies.
     *
     * NOTE(review): Str::getOptions() is not defined in this file; it is
     * presumably a project helper that builds the cURL option array for a URL
     * and cookie set — confirm against the project.
     *
     * @param  array  $urls    URLs to fetch; keys are ignored
     * @param  array  $cookie  cookies passed on to Str::getOptions()
     *
     * @return array response bodies, in the order of $urls
     */
    public function getUrlData(array $urls, array $cookie): array
    {
        $multi   = Curl::getMultiInstance();
        $handles = [];

        // array_values() so non-sequential or string keys do not cause
        // undefined-offset reads.
        foreach (array_values($urls) as $current_url) {
            $handle  = curl_init();
            $options = Str::getOptions($current_url, $cookie);
            // Bodies only: no header block in front of each result.
            $options[CURLOPT_HEADER] = false;
            curl_setopt_array($handle, $options);
            curl_multi_add_handle($multi, $handle);
            $handles[] = $handle;
        }

        $running = 0;
        do {
            do {
                $status = curl_multi_exec($multi, $running);
            } while ($status === CURLM_CALL_MULTI_PERFORM);

            if ($status !== CURLM_OK) {
                break;
            }

            // Block until there is activity instead of spinning on the CPU.
            // A return of -1 means select could not wait, so back off briefly.
            if ($running > 0 && curl_multi_select($multi, 1.0) === -1) {
                usleep(1000);
            }
        } while ($running > 0);

        $return_data = [];
        foreach ($handles as $handle) {
            $return_data[] = curl_multi_getcontent($handle);
            curl_multi_remove_handle($multi, $handle);
        }

        return $return_data;
    }
}

// Demo driver: fetch three URLs in parallel with the user's stored cookies.
$user      = 1; // user identity; hard-coded to 1 for this demo, as in captcha()
$url_array = ['https://www.baidu.com?111', 'https://www.baidu.com?222', 'https://www.baidu.com?333'];

// Load the user's cookies from the database. getUrlData() requires an array,
// so the result is converted with toArray(), as the captcha example does.
$cookie      = (new Cookie())->findCookieByUser($user)->toArray();
$result_data = (new Test())->getUrlData($url_array, $cookie);

// Close the shared multi handle once all batch work is finished.
curl_multi_close(Curl::getMultiInstance());

新评论

称呼不能为空
邮箱格式不合法
网站格式不合法
内容不能为空