纯php采集周松松最新文章

𝕷𝖎𝖑𝖏𝖆𝖈𝖐 发表于 2025-1-4 16:30:23

纯php采集周松松最新文章与邮件订阅发送

<?php
/*
 * Scrapes the zhousongsong.com home page and the article pages it links to,
 * then prints everything that was collected as one JSON document:
 *   webname / description - site name and meta description
 *   allinfo               - every entry of the "latest articles" list
 *   newarticleinfo        - details of the newest article
 *   webinfo               - site statistics, BA number and webmaster QQ
 *   comment               - latest comments
 *
 * The element ids, class names and text labels used below follow the target
 * site's templates and have to be adjusted whenever that markup changes.
 */

// The whole script is built on DOMDocument, so stop before doing any network work.
if (!extension_loaded('dom')) {
    die('DOMDocument扩展未加载，请检查PHP配置文件。');
}

$url = 'https://zhousongsong.com/';

// Random dotted-quad value sent in the X-FORWARDED-FOR / CLIENT-IP request headers.
$ip = rand(0, 255) . '.' . rand(0, 255) . '.' . rand(0, 255) . '.' . rand(0, 255);

// Pool of User-Agent strings; one of them is picked at random for this run.
$uaagent = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko)",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; +http://url-classification.io/wiki/index.php?title=URL_server_crawler) KStandBot/1.0",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    "Mozilla/5.0 (Linux; Android 6.0; 1503-M02 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036558 Safari/537.36 MicroMessenger/6.3.25.861 NetType/WIFI Language/zh_CN",
];
$randomKey = array_rand($uaagent);
$randomUserAgent = $uaagent[$randomKey];

/**
 * cURL wrapper: requests a URL and returns the response body.
 *
 * @param string $url      Address to request.
 * @param array  $headers  Request headers, given either as a list of
 *                         "Name: value" lines or as a name => value map.
 * @param mixed  $postData Value handed to CURLOPT_POSTFIELDS; only used when
 *                         $method is 'POST'.
 * @param string $method   'POST' issues a POST request, anything else a GET.
 *
 * @return string Response body (content-decoded when the server compressed it).
 *
 * @throws Exception When the handle cannot be created or the transfer fails.
 */
function fetchurl($url, $headers, $postData = null, $method = 'GET')
{
    // CURLOPT_HTTPHEADER only understands a list of "Name: value" lines, so a
    // name => value map is flattened here; plain list entries pass through.
    $headerLines = [];
    foreach ((array) $headers as $name => $value) {
        $headerLines[] = is_string($name) ? $name . ': ' . $value : $value;
    }

    $ch = curl_init();
    if ($ch === false) {
        throw new Exception("cURL Error: " . 'unable to initialise a cURL handle');
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // NOTE(review): TLS peer/host verification is switched off, exactly as in
    // the original script. That accepts any certificate; enable both options
    // if the target's certificate chain validates in your environment.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    // An empty string advertises every encoding this cURL build supports and
    // makes cURL decode the body, so callers never receive raw gzip bytes.
    curl_setopt($ch, CURLOPT_ENCODING, '');
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headerLines);

    if ($method === 'POST') {
        if ($postData !== null) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postData);
        }
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'POST');
    } else {
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'GET');
    }

    $result = curl_exec($ch);
    if ($result === false || curl_errno($ch)) {
        $error_msg = curl_error($ch);
        curl_close($ch);
        throw new Exception("cURL Error: " . $error_msg);
    }
    curl_close($ch);

    return $result;
}

/**
 * Returns one capture group from the first match of a pattern.
 *
 * @param string $pattern PCRE pattern including delimiters.
 * @param mixed  $subject Text to search; non-strings never match.
 * @param int    $group   Capture group index, 0 for the whole match.
 *
 * @return string Captured text, or '' when nothing matched or the group is absent.
 */
function regexGroup($pattern, $subject, $group = 0)
{
    if (!is_string($subject) || !preg_match($pattern, $subject, $found)) {
        return '';
    }

    return isset($found[$group]) ? $found[$group] : '';
}

/**
 * Parses an HTML string into a new DOMDocument without emitting parser warnings.
 *
 * @param mixed $html Markup to parse.
 *
 * @return DOMDocument The parsed document; an empty document when $html is
 *                     not a non-empty string.
 */
function loadHtmlDocument($html)
{
    $doc = new DOMDocument();
    if (!is_string($html) || $html === '') {
        return $doc;
    }

    // Collect libxml diagnostics internally instead of silencing them with "@".
    $previous = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $doc;
}

/**
 * Builds the article body from the <p> elements of the first <dd class="con">.
 *
 * Inline style attributes are dropped, everything from the first "来源：" marker
 * onwards is cut off, and <a> elements are replaced by their inner content.
 *
 * @param DOMDocument $doc         Parsed article page (its <p> nodes lose their
 *                                 style attribute as a side effect).
 * @param string|null $allowedTags Tag whitelist for strip_tags(), or null to
 *                                 leave the paragraph markup untouched.
 *
 * @return string Article HTML, or '' when the page has no <dd class="con">.
 */
function extractArticleBody($doc, $allowedTags = null)
{
    $xpath = new DOMXPath($doc);
    $containers = $xpath->query('//dd[@class="con"]'); // follows the article template markup
    if ($containers === false || $containers->length === 0) {
        return '';
    }

    $body = '';
    foreach ($containers->item(0)->getElementsByTagName('p') as $paragraph) {
        if ($paragraph->hasAttribute('style')) {
            $paragraph->removeAttribute('style');
        }
        $body .= $doc->saveHTML($paragraph);
    }

    if ($allowedTags !== null) {
        $body = strip_tags($body, $allowedTags);
    }

    $sourceIndex = strpos($body, '来源：');
    if ($sourceIndex !== false) {
        $body = substr($body, 0, $sourceIndex);
    }

    return preg_replace('/<a[^>]*>(.*?)<\/a>/', '$1', $body);
}

// Accept-Encoding is intentionally not listed: fetchurl() negotiates
// compression through CURLOPT_ENCODING and decodes the response itself.
$headers = [
    'X-FORWARDED-FOR' => $ip,
    'CLIENT-IP' => $ip,
    'Referer' => 'https://www.qq.com/',
    'User-Agent' => $randomUserAgent,
];

$result = fetchurl($url, $headers);

// Site name: the part of <title> in front of the first "-" (the whole title
// when it contains no "-").
$webname = regexGroup('/<title>(.*?)<\/title>/', $result, 1);
$dashPos = strpos($webname, '-');
if ($dashPos !== false) {
    $webname = substr($webname, 0, $dashPos);
}
$description = regexGroup('/name="description" content="(.*?)"/', $result, 1);

$dom = loadHtmlDocument($result);

// Reduce the "latest articles" container to headings, paragraphs and links and
// re-parse that fragment on its own.
$newstext = '';
$divtext = $dom->getElementById('con_one_1'); // follows the home page latest-list template
if ($divtext !== null) {
    foreach ($divtext->childNodes as $child) {
        $newstext .= $dom->saveHTML($child);
    }
}
$newstext = strip_tags($newstext, "<h1><h2><h3><h5><h6><br><p><a>");
// The XML declaration makes loadHTML treat the fragment as UTF-8.
$listDom = loadHtmlDocument('<?xml encoding="UTF-8">' . $newstext);
$h2Tags = $listDom->getElementsByTagName('h2');

$allinfo = [];        // article list
$webinfo = [];        // site information
$comment = [];        // latest comments
$newarticleinfo = []; // newest article details
$articlePages = [];   // raw HTML per article URL, so the newest article is fetched once

/*
 * Article list: every <h2> is one entry; the following <h6> holds date, view
 * count and comment count, and the <p> after that holds the summary.
 */
foreach ($h2Tags as $h2) {
    $anchor = $h2->getElementsByTagName('a')->item(0);
    $link = $anchor !== null ? $anchor->getAttribute('href') : '';
    if ($link === '') {
        continue; // a heading without a link is not an article entry
    }

    // Locate this entry's <h6>; never walk past the next entry's <h2>.
    $h6 = $h2->nextSibling;
    while ($h6 && $h6->nodeName !== 'h6' && $h6->nodeName !== 'h2') {
        $h6 = $h6->nextSibling;
    }
    if ($h6 && $h6->nodeName !== 'h6') {
        $h6 = null;
    }

    // Summary: the <p> elements directly after the <h6> (the last one wins).
    // Text nodes between the elements are skipped.
    $pcontent = '';
    $sibling = $h6 ? $h6->nextSibling : null;
    while ($sibling !== null) {
        if ($sibling->nodeType === XML_ELEMENT_NODE) {
            if ($sibling->nodeName !== 'p') {
                break;
            }
            $pcontent = $sibling->textContent;
        }
        $sibling = $sibling->nextSibling;
    }

    $date = '';
    $views = 0;
    $plnum = 0;
    if ($h6 instanceof DOMElement) {
        $h6Content = $h6->nodeValue;
        $date = regexGroup('/\d{4}年\d{2}月\d{2}日/', $h6Content);
        $views = (int) regexGroup("/浏览:(\d+)/", $h6Content, 1);
        $plnum = (int) regexGroup("/评论:(\d+)/", $h6Content, 1);
    }

    $articlePages[$link] = fetchurl($link, $headers);
    $newstexts = extractArticleBody(
        loadHtmlDocument($articlePages[$link]),
        '<h1><h2><h3><h5><h6><br><p><a><img>'
    );

    $allinfo[] = [
        'title' => $h2->textContent,
        'link' => $link,
        'date' => $date,
        'smalltext' => $pcontent,
        'onclick' => $views,
        'newstext' => $newstexts,
        'plnum' => $plnum,
    ];
}

// Newest article: the first <h2> of the list.
$firstH2 = $h2Tags->item(0);
$firstA = $firstH2 !== null ? $firstH2->getElementsByTagName('a')->item(0) : null;
if ($firstA !== null && $firstA->getAttribute('href') !== '') {
    $firstH2Content = $firstH2->nodeValue;
    $linkHref = $firstA->getAttribute('href');

    // NOTE(review): $title and $text are not read again in this listing;
    // presumably they feed the e-mail subscription step named in the post
    // title - confirm against the full script.
    $title = "有来自" . $webname . "的最新文章";
    $text = "" . $webname . "最新文章标题为《" . $firstH2Content . "》，地址：<a href='" . $linkHref . "' target='_blank'>" . $linkHref . "</a>";

    $resultt = isset($articlePages[$linkHref]) ? $articlePages[$linkHref] : fetchurl($linkHref, $headers);
    $articleDom = loadHtmlDocument($resultt);
    $newstext = extractArticleBody($articleDom);

    $viewNum = regexGroup('/<span class="commentViewNums">(.*?)<\/span>/', $resultt, 1);

    $h6Tag = $articleDom->getElementsByTagName('h6')->item(0);
    $h6Text = $h6Tag !== null ? trim($h6Tag->textContent) : '';
    $datetime = regexGroup('/\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}/', $h6Text);

    $newarticleinfo[] = [
        'title' => $firstH2Content,
        'link' => $linkHref,
        'date' => $datetime,
        'onclick' => str_replace('浏览量: ', '', $viewNum),
        'newstext' => $newstext,
    ];
}

// Site statistics from the home page.
$totalarticle = ''; // total number of articles
$totalplnum = '';   // total number of comments
$totalonclick = ''; // total number of views
$divStatistics = $dom->getElementById('divStatistics'); // follows the home page statistics template
$statsList = $divStatistics !== null ? $divStatistics->getElementsByTagName('ul')->item(0) : null;
if ($statsList !== null) {
    foreach ($statsList->getElementsByTagName('li') as $li) {
        $textContent = trim($li->textContent);
        // Each <li> carries one counter; the patterns that do not apply add ''.
        $totalarticle .= regexGroup('/文章总数:\s*(\d+)/', $textContent, 1);
        $totalplnum .= regexGroup('/评论总数:\s*(\d+)/', $textContent, 1);
        $totalonclick .= regexGroup('/浏览总数:\s*(\d+)/', $textContent, 1);
    }
}

// Latest comments from the home page.
$divComments = $dom->getElementById('divComments'); // follows the home page latest-comments template
$commentList = $divComments !== null ? $divComments->getElementsByTagName('ul')->item(0) : null;
if ($commentList !== null) {
    foreach ($commentList->getElementsByTagName('li') as $li) {
        $comment[] = trim($li->textContent);
    }
}

// BA number and webmaster QQ are read from the raw markup; tags are stripped
// before the label is removed so only the value is left.
$beianhao = trim(strip_tags(regexGroup('/BA号：.*?<\/a>/i', $result)));
$beianhao = str_replace('BA号：', '', $beianhao);
$QQhao = trim(strip_tags(regexGroup('/站长QQ：.*?<\/a>/i', $result)));
$QQhao = str_replace('站长QQ：', '', $QQhao);

$webinfo[] = [
    'totalarticle' => $totalarticle,
    'totalplnum' => $totalplnum,
    'beianhao' => $beianhao,
    'qq' => (int) $QQhao,
    'totalonclick' => $totalonclick,
];

$content = [
    'webname' => $webname,
    'newarticleinfo' => $newarticleinfo,
    'allinfo' => $allinfo,
    'webinfo' => $webinfo,
    'comment' => $comment,
    // $emailmsg is never assigned in this listing; emit null instead of
    // reading an undefined variable.
    'emailmsg' => isset($emailmsg) ? $emailmsg : null,
    'description' => $description,
    'code' => 200,
    'msg' => '获取成功',
];

// JSON_UNESCAPED_SLASHES keeps URLs readable. The output is printed as encoded:
// running stripslashes() over it would also remove the escapes that protect
// quotes and line breaks inside strings and yield invalid JSON.
$Json = json_encode($content, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if ($Json === false) {
    // Typically malformed UTF-8 in the scraped text.
    $Json = json_encode(['code' => 500, 'msg' => json_last_error_msg()]);
}
echo $Json;
?>

输出JSON如下：

婷姐发表于 2025-1-4 16:30:46

厉害

TyCoding 发表于 2025-1-4 16:31:30

你口味真重啊

拾光发表于 2025-1-4 16:32:30

这是抛砖引玉，稍微看懂流程的搬运下就能实现微信公众号采集了！正则+php解析DOM

浅生发表于 2025-1-4 16:32:51

微信公众号的地址怎么获取？

IT618发布 发表于 2025-1-4 16:33:39

浏览器打开不就看到地址了吗

浅生发表于 2025-1-4 16:34:30

技术杠杠的，厉害。

婷姐发表于 2025-1-4 16:34:56

$ip = rand(0,255).'.'.rand(0,255).'.'.rand(0,255).'.'.rand(0,255) ; //随机IP

我突然意识到了什么，最近我的网站貌似被采集了（疑似这种随机伪造IP采集）
--------------------------------------------------------------------------------------------
难怪我站收录骤减，昨天还被采集持续了3个多小时，10多万文章估计被撸个遍~！

独家记忆 发表于 2025-1-4 16:35:36

这随机ip能起作用吗

浅生发表于 2025-1-4 16:36:20

这是为了防止松松查看日志伪造的IP访问

页: [1] 2

DZ插件网's Archiver

纯php采集周松松最新文章