𝕷𝖎𝖑𝖏𝖆𝖈𝖐 发表于 2025-1-4 16:30:23

纯php采集周松松最新文章

纯php采集周松松最新文章与邮件订阅发送
<?php
/**
 * Scrapes the home page of zhousongsong.com and prints one JSON document with:
 * the site name and description, the newest article, the article list shown on
 * the home page (each entry with its fetched body), the site statistics, the
 * latest comments and the filing/contact details found in the page source.
 *
 * All CSS ids / class names used below follow the target site's templates and
 * have to be adjusted whenever those templates change.
 */

/**
 * Performs an HTTP request with cURL and returns the response body.
 *
 * @param string            $url      Address to request.
 * @param array             $headers  Either a map of header name => value or a
 *                                    ready-made list of "Name: value" lines.
 * @param array|string|null $postData Request body; only used when $method is 'POST'.
 * @param string            $method   'POST' issues a POST, anything else a GET.
 *
 * @return string The response body (already decompressed by cURL).
 *
 * @throws Exception When the handle cannot be created or the transfer fails.
 */
function fetchurl($url, $headers, $postData = null, $method = 'GET')
{
    $ch = curl_init();
    if ($ch === false) {
        throw new Exception("cURL Error: " . 'unable to initialise a cURL handle');
    }

    // CURLOPT_HTTPHEADER wants a list of "Name: value" strings. Passing a map
    // straight through would silently drop the header names.
    $headerLines = [];
    foreach ((array) $headers as $name => $value) {
        $headerLines[] = is_int($name) ? (string) $value : $name . ': ' . $value;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    // Without timeouts a stalled server would block the whole run forever.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    // An empty string advertises every encoding cURL supports AND makes cURL
    // decode the body; a hand-written Accept-Encoding header does only the former.
    curl_setopt($ch, CURLOPT_ENCODING, '');
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headerLines);
    // TLS peer/host verification is deliberately left at cURL's secure defaults.

    if ($method === 'POST') {
        if ($postData !== null) {
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postData);
        } else {
            curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'POST');
        }
    }

    $result = curl_exec($ch);
    if ($result === false || curl_errno($ch)) {
        $error_msg = curl_error($ch);
        curl_close($ch);
        throw new Exception("cURL Error: " . $error_msg);
    }

    curl_close($ch);

    return $result;
}

/**
 * Returns one capture group of the first match of $pattern, or $default.
 *
 * @param string $pattern PCRE pattern including delimiters.
 * @param string $subject Text to search.
 * @param int    $group   Capture group to return (0 = whole match).
 * @param string $default Value returned when nothing matched.
 */
function firstMatch(string $pattern, string $subject, int $group = 0, string $default = ''): string
{
    if (preg_match($pattern, $subject, $found) === 1 && isset($found[$group])) {
        return $found[$group];
    }

    return $default;
}

/**
 * Parses an HTML string into a DOMDocument without emitting libxml warnings.
 *
 * @param string $html      Markup to parse; an empty string yields an empty document.
 * @param bool   $forceUtf8 Prefix an XML encoding declaration so that a fragment
 *                          without its own charset meta tag is read as UTF-8.
 */
function loadHtmlDocument(string $html, bool $forceUtf8 = false): DOMDocument
{
    $doc = new DOMDocument();
    if (trim($html) === '') {
        // loadHTML() rejects empty input, so hand back the empty document.
        return $doc;
    }

    $previous = libxml_use_internal_errors(true);
    $doc->loadHTML($forceUtf8 ? '<?xml encoding="UTF-8">' . $html : $html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $doc;
}

/**
 * Serialises the children of $node (its "inner HTML").
 */
function innerHtml(DOMNode $node): string
{
    $html = '';
    foreach ($node->childNodes as $child) {
        $html .= $node->ownerDocument->saveHTML($child);
    }

    return $html;
}

/**
 * Walks the following siblings of $start and returns the first one named
 * $wanted. The walk gives up when a sibling named $stopAt is reached, so the
 * search never runs into the next article of the list.
 *
 * @return DOMNode|null
 */
function followingSibling(DOMNode $start, string $wanted, string $stopAt)
{
    for ($node = $start->nextSibling; $node !== null; $node = $node->nextSibling) {
        if ($node->nodeName === $wanted) {
            return $node;
        }
        if ($node->nodeName === $stopAt) {
            return null;
        }
    }

    return null;
}

/**
 * Extracts the article text from an article page.
 *
 * Collects every paragraph inside the first dd element with class "con" (this
 * follows the content template of the target site), drops inline styles, cuts
 * everything from the source credit onwards and unwraps links.
 *
 * @param DOMDocument $doc         Parsed article page; style attributes of the
 *                                 collected paragraphs are removed in place.
 * @param string      $allowedTags Tag whitelist for strip_tags(); '' keeps all tags.
 *
 * @return string The cleaned body, or '' when the container is missing.
 */
function extractArticleBody(DOMDocument $doc, string $allowedTags = ''): string
{
    $xpath = new DOMXPath($doc);
    $containers = $xpath->query('//dd[@class="con"]');
    if ($containers === false || $containers->length === 0) {
        return '';
    }

    $body = '';
    foreach ($containers->item(0)->getElementsByTagName('p') as $paragraph) {
        if ($paragraph->hasAttribute('style')) {
            $paragraph->removeAttribute('style');
        }
        $body .= $doc->saveHTML($paragraph);
    }

    if ($allowedTags !== '') {
        $body = strip_tags($body, $allowedTags);
    }

    $sourceIndex = strpos($body, '来源:');
    if ($sourceIndex !== false) {
        $body = substr($body, 0, $sourceIndex);
    }

    $unwrapped = preg_replace('/<a[^>]*>(.*?)<\/a>/', '$1', $body);

    return $unwrapped === null ? $body : $unwrapped;
}

if (!extension_loaded('dom')) {
    die('DOMDocument扩展未加载,请检查PHP配置文件。');
}

$url = 'https://zhousongsong.com/';

// Random IP placed into the client-IP style headers below.
// NOTE(review): these headers do not change the address the server actually
// sees on the connection; confirm that sending them (and crawling at all) is
// acceptable under the target site's terms before running this.
$ip = rand(0, 255) . '.' . rand(0, 255) . '.' . rand(0, 255) . '.' . rand(0, 255);

$uaagent = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko)",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; +http://url-classification.io/wiki/index.php?title=URL_server_crawler) KStandBot/1.0",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    "Mozilla/5.0 (Linux; Android 6.0; 1503-M02 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036558 Safari/537.36 MicroMessenger/6.3.25.861 NetType/WIFI Language/zh_CN",
];
$randomUserAgent = $uaagent[array_rand($uaagent)];

// Accept-Encoding is negotiated by fetchurl() through CURLOPT_ENCODING.
$headers = [
    'X-FORWARDED-FOR' => $ip,
    'CLIENT-IP' => $ip,
    'Referer' => 'https://www.qq.com/',
    'User-Agent' => $randomUserAgent,
];

$result = fetchurl($url, $headers);

// Site name = the part of the page title before the first '-'; the whole title
// is used when it contains no '-'.
$pageTitle = firstMatch('/<title>(.*?)<\/title>/', $result, 1);
$dashPos = strpos($pageTitle, '-');
$webname = $dashPos === false ? $pageTitle : substr($pageTitle, 0, $dashPos);
$description = firstMatch('/name="description" content="(.*?)"/', $result, 1);

$homeDoc = loadHtmlDocument($result);

// Latest-article list; the id follows the home page template of the site.
$listContainer = $homeDoc->getElementById('con_one_1');
$listHtml = $listContainer === null
    ? ''
    : strip_tags(innerHtml($listContainer), "<h1><h2><h3><h5><h6><br><p><a>");
$listDoc = loadHtmlDocument($listHtml, true);
$h2Nodes = $listDoc->getElementsByTagName('h2');

$allinfo = [];        // article list
$webinfo = [];        // site information
$comment = [];        // latest comments
$newarticleinfo = []; // newest article
$articlePages = [];   // article URL => downloaded HTML, so no page is fetched twice

// Each h2 of the list is one article: collect title, link, summary, body and
// the counters found in the h6 line that follows the heading.
foreach ($h2Nodes as $h2) {
    $anchor = $h2->getElementsByTagName('a')->item(0);
    if ($anchor === null) {
        continue; // a heading without a link is not an article entry
    }

    $articleTitle = $h2->textContent;
    $link = $anchor->getAttribute('href');

    $h6 = followingSibling($h2, 'h6', 'h2');
    $summaryNode = followingSibling($h6 !== null ? $h6 : $h2, 'p', 'h2');
    $pcontent = $summaryNode === null ? '' : trim($summaryNode->textContent);

    try {
        $articlePages[$link] = fetchurl($link, $headers);
        $newstexts = extractArticleBody(
            loadHtmlDocument($articlePages[$link]),
            '<h1><h2><h3><h5><h6><br><p><a><img>'
        );
    } catch (Exception $e) {
        // One unreachable article must not abort the whole run.
        error_log($e->getMessage());
        $newstexts = '';
    }

    if ($h6 instanceof DOMElement) {
        $h6Content = $h6->nodeValue;
        $date = firstMatch('/\d{4}年\d{2}月\d{2}日/', $h6Content);
        $views = (int) firstMatch('/浏览:(\d+)/', $h6Content, 1, '0');
        $commentCount = (int) firstMatch('/评论:(\d+)/', $h6Content, 1, '0');
    } else {
        $date = '';
        $views = 0;
        $commentCount = 0;
    }

    // json_encode() does all the escaping; no manual slashes are needed.
    $allinfo[] = [
        'title' => $articleTitle,
        'link' => $link,
        'date' => $date,
        'smalltext' => $pcontent,
        'onclick' => $views,
        'newstext' => $newstexts,
        'plnum' => $commentCount,
    ];
}

// Newest article = first heading of the list.
$firstH2 = $h2Nodes->item(0);
$firstA = $firstH2 === null ? null : $firstH2->getElementsByTagName('a')->item(0);
if ($firstA !== null) {
    $firstH2Content = $firstH2->nodeValue;
    $linkHref = $firstA->getAttribute('href');

    // Subject and HTML body of the notification mail.
    // NOTE(review): the code that sends the mail is not part of this snippet,
    // so these two values are currently unused.
    $safeLink = htmlspecialchars($linkHref, ENT_QUOTES, 'UTF-8');
    $title = "有来自" . $webname . "的最新文章";
    $text = "" . $webname . "最新文章标题为《" . htmlspecialchars($firstH2Content, ENT_QUOTES, 'UTF-8')
        . "》,地址:<a href='" . $safeLink . "' target='_blank'>" . $safeLink . "</a>";

    $resultt = isset($articlePages[$linkHref]) ? $articlePages[$linkHref] : fetchurl($linkHref, $headers);
    $articleDoc = loadHtmlDocument($resultt);
    $newstext = extractArticleBody($articleDoc);

    $viewNum = firstMatch('/<span class="commentViewNums">(.*?)<\/span>/', $resultt, 1);

    $h6Tag = $articleDoc->getElementsByTagName('h6')->item(0);
    $h6Text = $h6Tag === null ? '' : trim($h6Tag->textContent);
    $datetime = firstMatch('/\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}/', $h6Text);

    $newarticleinfo[] = [
        'title' => $firstH2Content,
        'link' => $linkHref,
        'date' => $datetime,
        'onclick' => str_replace('浏览量: ', '', $viewNum),
        'newstext' => $newstext,
    ];
}

// Site statistics; the id follows the "live data" part of the home page template.
$totalarticle = ''; // total number of articles
$totalplnum = '';   // total number of comments
$totalonclick = ''; // total number of views
$divStatistics = $homeDoc->getElementById('divStatistics');
$statsList = $divStatistics === null ? null : $divStatistics->getElementsByTagName('ul')->item(0);
if ($statsList !== null) {
    foreach ($statsList->getElementsByTagName('li') as $li) {
        $textContent = trim($li->textContent);
        // Every li carries at most one of the three counters; the other two
        // lookups contribute an empty string.
        $totalarticle .= firstMatch('/文章总数:\s*(\d+)/', $textContent, 1);
        $totalplnum .= firstMatch('/评论总数:\s*(\d+)/', $textContent, 1);
        $totalonclick .= firstMatch('/浏览总数:\s*(\d+)/', $textContent, 1);
    }
}

// Latest comments; the id follows the comment part of the home page template.
$divComments = $homeDoc->getElementById('divComments');
$commentList = $divComments === null ? null : $divComments->getElementsByTagName('ul')->item(0);
if ($commentList !== null) {
    foreach ($commentList->getElementsByTagName('li') as $li) {
        $comment[] = trim($li->textContent);
    }
}

// Filing number and webmaster QQ are read from the raw page source.
$beianhao = trim(strip_tags(firstMatch('/BA号:.*?<\/a>/i', $result)));
$beianhao = str_replace('BA号:', '', $beianhao);
$QQhao = trim(strip_tags(firstMatch('/站长QQ:.*?<\/a>/i', $result)));
$QQhao = str_replace('站长QQ:', '', $QQhao);

$webinfo[] = [
    'totalarticle' => $totalarticle,
    'totalplnum' => $totalplnum,
    'beianhao' => $beianhao,
    'qq' => (int) trim($QQhao),
    'totalonclick' => $totalonclick,
];

$content = [
    'webname' => $webname,
    'newarticleinfo' => $newarticleinfo,
    'allinfo' => $allinfo,
    'webinfo' => $webinfo,
    'comment' => $comment,
    // NOTE(review): nothing in this snippet assigns $emailmsg; it is presumably
    // filled by the mail-sending step, so it falls back to null here.
    'emailmsg' => isset($emailmsg) ? $emailmsg : null,
    'description' => $description,
    'code' => 200,
    'msg' => '获取成功',
];

// JSON_UNESCAPED_SLASHES keeps URLs readable. The encoder output must be
// printed untouched, because stripping backslashes afterwards would corrupt
// every escape sequence.
$Json = json_encode($content, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if ($Json === false) {
    throw new Exception('JSON Error: ' . json_last_error_msg());
}
if (!headers_sent()) {
    header('Content-Type: application/json; charset=utf-8');
}
echo $Json;
?>
输出JSON如下:

婷姐 发表于 2025-1-4 16:30:46

厉害

TyCoding 发表于 2025-1-4 16:31:30

你口味真重啊

拾光 发表于 2025-1-4 16:32:30

这是抛砖引玉,稍微看懂流程的搬运下就能实现微信公众号采集了!正则+php解析DOM

浅生 发表于 2025-1-4 16:32:51

微信公众号的地址怎么获取?

IT618发布 发表于 2025-1-4 16:33:39

浏览器打开不就看到地址了吗

浅生 发表于 2025-1-4 16:34:30

技术杠杠的,厉害。

婷姐 发表于 2025-1-4 16:34:56

$ip = rand(0,255).'.'.rand(0,255).'.'.rand(0,255).'.'.rand(0,255) ;      //随机IP

我突然意识到了什么,最近我的网站貌似被采集了(疑似这种随机伪造IP采集)|
--------------------------------------------------------------------------------------------
难怪我站 收录骤减,昨天还被采集 持续了3个多小时,10多万文章 估计被撸个遍~!

独家记忆 发表于 2025-1-4 16:35:36

这随机ip能起作用吗

浅生 发表于 2025-1-4 16:36:20

这是为了防止松松查看日志伪造的IP访问
页: [1] 2
查看完整版本: 纯php采集周松松最新文章