php curl爬取网页与thinkphp直接输出图片简单记录
简单记录下方便以后查阅。
curl类
首先封装一个 Curl 类供调用。它仿照单例模式复用同一个 curl 句柄,以提高资源利用率、加快访问速度(至少我自测过,复用单个句柄确实可以加快速度)。构造函数的第一个参数是 url(必填),第二个是 cookie,第三个是请求数据,其后依次是 header、请求类型(post/get)和数据类型(json/form)。
2021年6月23日更新这个类
<?php
/**
 * Small fluent wrapper around PHP's cURL extension.
 *
 * A single easy handle and a single multi handle are created lazily and shared
 * by every instance of the class (see getInstance() / getMultiInstance()).
 * When the URL is a string one request is performed on the shared easy handle;
 * when it is an array of URLs they are fetched concurrently through the shared
 * multi handle.
 *
 * Typical use:
 *     $curl = Curl::create($url, ['USER' => 'f3400cb5'])->exec();
 *     echo $curl->response;
 */
class Curl
{
    /**
     * @var array Default cURL options applied to every request.
     *            NOTE(review): TLS peer/host verification is disabled here, which is
     *            convenient for scraping but unsafe for anything sensitive.
     */
    protected static $options = [
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_RETURNTRANSFER => true,
        // Track the outgoing request headers so exec() can expose them.
        CURLINFO_HEADER_OUT => true,
        // Include response headers in the output; exec() splits them from the body.
        CURLOPT_HEADER => true,
        CURLOPT_AUTOREFERER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13'
            . '(KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
    ];

    /**
     * @var false|resource|null Shared easy handle (created on first use).
     */
    private static $curlInstance = null;

    /**
     * @var false|resource|null Shared multi handle (created on first use).
     */
    private static $curlMultiInstance = null;

    /**
     * @var mixed Response body; an array of bodies when several URLs were requested.
     */
    public $response;

    /**
     * @var string Raw response headers (single-URL requests only).
     */
    public $responseHeader;

    /**
     * @var string Raw request headers that were sent (single-URL requests only).
     *             The misspelled name is kept because it is part of the public interface.
     */
    public $requestHerder;

    /**
     * @var int|null HTTP status code of the response (single-URL requests only).
     */
    public $responseCode;

    /**
     * @var string cURL error message of the last single-URL request, '' when it succeeded.
     */
    public $error = '';

    /**
     * @var false|resource|null Handle used by the most recent exec() call.
     */
    protected $instance = null;

    /**
     * @var array Data sent with the request.
     */
    protected $data = [];

    /**
     * @var string Data type; 'json' makes the request data be json_encode()d.
     */
    protected $dataType;

    /**
     * @var array Request headers as ['Name' => 'value'].
     */
    protected $header = [];

    /**
     * @var string|array URL to request, or a list of URLs to request concurrently.
     */
    protected $url;

    /**
     * @var array Cookies sent with the request as ['name' => 'value'].
     */
    protected $cookie = [];

    /**
     * @var string Request method, 'post' or 'get' (default GET).
     */
    protected $requestType;

    /**
     * Curl constructor.
     *
     * @param string|array $url         URL, or list of URLs for concurrent fetching
     * @param array        $cookie      cookies as ['name' => 'value']
     * @param array        $data        request data
     * @param array        $header      request headers as ['Name' => 'value']
     * @param string       $requestType request method, post/get
     * @param string       $dataType    data type, json/form
     */
    public function __construct(
        $url = '',
        $cookie = [],
        $data = [],
        $header = [],
        $requestType = 'GET',
        $dataType = ''
    ) {
        $this->setUrl($url);
        $this->setCookie($cookie);
        $this->setData($data);
        $this->setHeader($header);
        $this->setRequestType($requestType);
        $this->setDataType($dataType);
    }

    /**
     * Static factory with the same parameters as the constructor, for fluent chaining.
     *
     * @return static
     */
    public static function create(
        $url = '',
        $cookie = [],
        $data = [],
        $header = [],
        $requestType = 'GET',
        $dataType = ''
    ) {
        return new static($url, $cookie, $data, $header, $requestType, $dataType);
    }

    /**
     * @return false|resource|null The shared easy handle, created on first call.
     */
    public static function getInstance()
    {
        if (self::$curlInstance === null) {
            self::$curlInstance = curl_init();
        }
        return self::$curlInstance;
    }

    /**
     * @return false|resource|null The shared multi handle, created on first call.
     */
    public static function getMultiInstance()
    {
        if (self::$curlMultiInstance === null) {
            self::$curlMultiInstance = curl_multi_init();
        }
        return self::$curlMultiInstance;
    }

    /**
     * @param mixed $url URL to request, or an array of URLs
     * @return $this
     */
    public function setUrl($url): Curl
    {
        $this->url = $url;
        return $this;
    }

    /**
     * Merge cookies into the ones already set; non-array input is ignored.
     *
     * @param array $cookie e.g. USER=f3400cb5 is written as ['USER' => 'f3400cb5']
     * @return $this
     */
    public function setCookie($cookie = []): Curl
    {
        if (is_array($cookie)) {
            $this->cookie = array_merge($this->cookie, $cookie);
        }
        return $this;
    }

    /**
     * Merge request data into the data already set; non-array input is ignored.
     *
     * @param array $data request data
     * @return $this
     */
    public function setData($data = []): Curl
    {
        if (is_array($data)) {
            $this->data = array_merge($this->data, $data);
        }
        return $this;
    }

    /**
     * @param string $dataType 'json' or 'form'; both switch the request to POST, and
     *                         'json' additionally json_encode()s the request data
     * @return $this
     */
    public function setDataType($dataType = ''): Curl
    {
        // Only compared against ASCII tokens, so the byte-wise strtolower is sufficient.
        $this->dataType = strtolower((string) $dataType);
        return $this;
    }

    /**
     * Merge request headers into the ones already set; non-array input is ignored.
     *
     * @param array $header e.g. ['referer' => 'some_content']
     * @return $this
     */
    public function setHeader($header = []): Curl
    {
        if (is_array($header)) {
            $this->header = array_merge($this->header, $header);
        }
        return $this;
    }

    /**
     * @param string $requestType request method, post or get (default get)
     * @return $this
     */
    public function setRequestType($requestType = 'GET'): Curl
    {
        $this->requestType = $requestType;
        return $this;
    }

    /**
     * Set the Content-Type request header.
     *
     * @param string $contentType media type
     * @param string $charset     character encoding appended as "; charset=..."
     * @return $this
     */
    public function setContentType($contentType = 'application/x-www-form-urlencoded', $charset = 'utf-8'): Curl
    {
        $this->header = array_merge($this->header, ['Content-Type' => $contentType . '; charset=' . $charset]);
        return $this;
    }

    /**
     * Build the cURL option array for one URL from the current state.
     *
     * Side effect: for json/form data types and for POST bodies this also updates
     * the stored headers and request type of the object.
     *
     * @param string $url URL to request
     * @return array options suitable for curl_setopt_array()
     */
    public function getOption($url): array
    {
        $options = static::$options;

        if (!empty($this->cookie)) {
            $pairs = [];
            foreach ($this->cookie as $name => $value) {
                $pairs[] = $name . '=' . $value;
            }
            $options[CURLOPT_COOKIE] = implode('; ', $pairs);
        }

        // json/form data types imply a POST with the matching Content-Type.
        if ($this->dataType === 'json') {
            $this->setContentType('application/json')->setRequestType('post');
        } elseif ($this->dataType === 'form') {
            $this->setContentType()->setRequestType('post');
        }

        if (strtolower((string) $this->requestType) === 'post') {
            $options[CURLOPT_POST] = true;
            if (!empty($this->data)) {
                $requestData = $this->dataType === 'json'
                    ? json_encode($this->data, JSON_UNESCAPED_UNICODE)
                    : http_build_query($this->data);
                $length = strlen($requestData);
                if ($length >= 1024) {
                    // An empty Expect header stops curl from sending "Expect: 100-continue"
                    // for larger bodies.
                    $this->setHeader(['Expect' => '']);
                }
                $this->header = array_merge($this->header, ['Content-Length' => $length]);
                $options[CURLOPT_POSTFIELDS] = $requestData;
            }
        } else {
            $options[CURLOPT_POST] = false;
            if (!empty($this->data)) {
                $url = $this->appendQuery($url, $this->data);
            }
        }

        $options[CURLOPT_URL] = $url;

        $headerLines = [];
        foreach ($this->header as $name => $value) {
            $headerLines[] = $name . ':' . $value;
        }
        $options[CURLOPT_HTTPHEADER] = $headerLines;

        return $options;
    }

    /**
     * Perform the request(s) and store the results on this object.
     *
     * For a single URL, response, responseHeader, responseCode and requestHerder
     * are filled in; on a transport failure the message is stored in $error and
     * response/responseHeader are empty strings. For an array of URLs only
     * response is filled, with one body per URL in input order.
     * The request configuration (url, data, headers, ...) is cleared afterwards.
     *
     * @return $this
     * @throws Exception when no URL has been set
     */
    public function exec(): Curl
    {
        if (empty($this->url)) {
            throw new Exception('URL is empty!');
        }

        $this->error = '';
        if (is_array($this->url)) {
            $this->instance = static::getMultiInstance();
            $this->response = $this->execMulti(array_values($this->url));
        } else {
            $this->instance = static::getInstance();
            $this->execSingle();
        }

        $this->restAtt();
        return $this;
    }

    /**
     * @return array The result properties collected into an array.
     */
    public function toArray(): array
    {
        return [
            'response' => $this->response,
            'responseHeader' => $this->responseHeader,
            'requestHerder' => $this->requestHerder,
            'responseCode' => $this->responseCode,
        ];
    }

    /**
     * Append $data as a query string, honouring a query string already in $url.
     */
    private function appendQuery($url, array $data)
    {
        $url = (string) $url;
        $query = http_build_query($data);
        if ($query === '') {
            return $url;
        }
        $last = substr($url, -1);
        if ($last === '?' || $last === '&') {
            $separator = '';
        } else {
            $separator = strpos($url, '?') === false ? '?' : '&';
        }
        return $url . $separator . $query;
    }

    /**
     * Run one request on the shared easy handle and store the result properties.
     */
    private function execSingle()
    {
        // The handle is shared between requests: clear options left over from the
        // previous one (cookies, POST body, ...) so they cannot leak into this one.
        curl_reset($this->instance);
        curl_setopt_array($this->instance, $this->getOption($this->url));

        $raw = curl_exec($this->instance);
        if ($raw === false) {
            $this->error = curl_error($this->instance);
            $raw = '';
        }

        // Headers and body arrive as one string; the header size tells where to split.
        $headerSize = curl_getinfo($this->instance, CURLINFO_HEADER_SIZE);
        $this->responseHeader = substr($raw, 0, $headerSize);
        $this->response = substr($raw, $headerSize);
        $this->responseCode = curl_getinfo($this->instance, CURLINFO_HTTP_CODE);
        $this->requestHerder = curl_getinfo($this->instance, CURLINFO_HEADER_OUT);
    }

    /**
     * Fetch all URLs concurrently on the shared multi handle.
     *
     * @param array $urls list of URLs
     * @return array response bodies, in the same order as $urls
     */
    private function execMulti(array $urls): array
    {
        $handles = [];
        foreach ($urls as $url) {
            $handle = curl_init();
            $options = $this->getOption($url);
            // Bodies only: the headers are not split off in multi mode.
            $options[CURLOPT_HEADER] = false;
            curl_setopt_array($handle, $options);
            curl_multi_add_handle($this->instance, $handle);
            $handles[] = $handle;
        }

        $running = null;
        do {
            $status = curl_multi_exec($this->instance, $running);
            if ($running > 0 && $status === CURLM_OK) {
                // Wait for socket activity instead of spinning at full CPU;
                // -1 means select could not wait, so back off briefly.
                if (curl_multi_select($this->instance, 1.0) === -1) {
                    usleep(1000);
                }
            }
        } while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

        $bodies = [];
        foreach ($handles as $handle) {
            $bodies[] = curl_multi_getcontent($handle);
            curl_multi_remove_handle($this->instance, $handle);
        }
        return $bodies;
    }

    /**
     * Clear the request configuration so a reused object starts from a clean state.
     */
    private function restAtt()
    {
        $this->data = [];
        $this->dataType = '';
        $this->header = [];
        $this->url = '';
        $this->cookie = [];
        $this->requestType = 'GET';
    }
}
获取验证码并储存COOKIE
框架: TP6
/**
 * Fetch the remote captcha image with the user's stored cookies and return it
 * directly as a PNG response.
 *
 * @return mixed the framework response carrying the image bytes
 */
public function captcha()
{
    $user = 1; // user identity; hard-coded to 1 for this demo
    $url = 'http://xxx.xxx.xxx/VerifyCodeHandler.ashx'; // captcha URL
    // Load the user's cookies from the database.
    $cookieObj = new Cookie();
    $cookie = $cookieObj->findCookieByUser($user);
    // Request the captcha page with those cookies.
    $curl = Curl::create($url, $cookie->toArray())->exec();
    // Output the image bytes directly. Content-Length must be taken from the body
    // that is actually sent; the previous code read an undefined $response variable.
    $image = $curl->response;
    return response($image, 200, ['Content-Length' => strlen($image)])->contentType('image/png');
}
批量访问URL
class Test
{
    /**
     * Fetch several URLs concurrently and return their response bodies.
     *
     * @param array $urls   URLs to fetch
     * @param array $cookie cookies passed to Str::getOptions() for every request
     *
     * @return array response bodies, in the same order as $urls
     */
    public function getUrlData(array $urls, array $cookie): array
    {
        $multiHandle = Curl::getMultiInstance();

        // One easy handle per URL; array_values() makes non-list input safe to iterate in order.
        $handles = [];
        foreach (array_values($urls) as $currentUrl) {
            $handle = curl_init();
            $options = Str::getOptions($currentUrl, $cookie); // build the cURL options
            $options[CURLOPT_HEADER] = false; // bodies only
            curl_setopt_array($handle, $options);
            curl_multi_add_handle($multiHandle, $handle);
            $handles[] = $handle;
        }

        $running = null;
        do {
            $status = curl_multi_exec($multiHandle, $running);
            if ($running > 0 && $status === CURLM_OK) {
                // Wait for socket activity instead of spinning at full CPU;
                // -1 means select could not wait, so back off briefly.
                if (curl_multi_select($multiHandle, 1.0) === -1) {
                    usleep(1000);
                }
            }
        } while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

        $returnData = [];
        foreach ($handles as $handle) {
            $returnData[] = curl_multi_getcontent($handle); // collect the body
            curl_multi_remove_handle($multiHandle, $handle); // detach the handle
        }
        return $returnData;
    }
}
// Usage example: fetch three URLs concurrently with one user's cookies.
$user = 1; // demo user id; this variable was previously used without being defined
$url_array = ['https://www.baidu.com?111', 'https://www.baidu.com?222', 'https://www.baidu.com?333'];
// Load the user's cookies from the database. getUrlData() requires an array,
// so the cookie object is converted with toArray() before it is passed on.
$cookie = (new Cookie())->findCookieByUser($user)->toArray();
$result_data = (new Test())->getUrlData($url_array, $cookie);
// Close the shared multi handle once all batches are done.
// NOTE(review): Curl::getMultiInstance() keeps returning this same handle afterwards,
// so do not start another batch in the same process after closing it.
curl_multi_close(Curl::getMultiInstance());