程式人生 > 簡單使用phpspider採集本博客文章內容

簡單使用phpspider采集本博客文章內容

什麽 days cto close sig fopen bsp use Coding

采集流程

根據鏈接獲取頁面內容(curl) -> 獲取需要採集的內容(可以通過正則、XPath、CSS 選擇器等方法進行篩選) -> 將標題與內容寫入文件

<?php

/*
 * Minimal phpspider example: walk the blog's list pages, collect the article
 * URLs, then download every article and save "<title>\r\n<plain-text body>"
 * to a file named f<page index><article index>.txt in the working directory.
 */

require_once 'phpspider/autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* Do not delete this comment (English rendering of the original Chinese note) */

/**
 * Run a regex selector against a page and always return a flat list of
 * string captures, so callers never have to care about the return shape.
 *
 * NOTE(review): selector::select() appears to return null when nothing
 * matches, a plain string for a single match and an array for several --
 * confirm against the phpspider version in use.
 *
 * @param mixed  $html    Page source as returned by requests::get().
 * @param string $pattern Delimited PCRE pattern with one capture group.
 *
 * @return array List of captured strings (empty when nothing usable matched).
 */
function select_all($html, $pattern)
{
    // A failed request yields no usable markup; report "no matches".
    if (!is_string($html) || $html === '') {
        return [];
    }

    $found = selector::select($html, $pattern, 'regex');

    // (array) turns null into [], a string into [string]; non-string
    // entries (false, nested arrays) are dropped.
    return array_values(array_filter((array) $found, 'is_string'));
}

// NOTE(review): both encodings are kept exactly as in the original script;
// confirm that GB2312 really matches the charset served by the target site.
requests::$input_encoding = 'GB2312';
requests::$output_encoding = 'GB2312';

// Step 1: collect the article URLs from list pages 1..10.
// [^"]+ keeps the capture inside the href attribute; a greedy (.*) could
// run on to a later "> on the same line.
$linkPattern = '/<a\sid="homepage1_HomePageDays_DaysList_ctl0\d_DayList_TitleUrl_\d"\sclass="postTitle2"\shref="([^"]+)">/';

$result = [];
for ($i = 1; $i <= 10; $i++) {
    $url = "https://www.cnblogs.com/jcydd/default.html?page=" . $i;
    $html = requests::get($url);
    // One entry per list page (possibly empty) so $k below stays the page index.
    $result[] = select_all($html, $linkPattern);
}

// Step 2: fetch each article, extract title and body, write them to a file.
// "@" is only the regex delimiter: the patterns contain "/" (in "</a>"),
// so "/" cannot be used as the delimiter without escaping.
$titlePattern = '@<a\sid="cb_post_title_url"\s(?:.)+?>(.*?)</a>@';
// The "s" modifier lets "." match newlines; the lazy quantifier stops at the
// first signature block. This replaces ((.|\n)*), whose extra capture group
// and heavy backtracking made some article bodies come back empty.
$bodyPattern = '@<div\sid="post_body"\sclass="blogpost-body">(.*?)<div\sid="MySignature">@s';

foreach ($result as $k => $v) {
    foreach ($v as $kk => $vv) {
        $html1 = requests::get($vv);

        $titles = select_all($html1, $titlePattern);
        $title = isset($titles[0]) ? $titles[0] : '';

        // strip_tags() replaces the old /<[^<]*>/ regex, which excluded the
        // wrong character and could delete visible text up to a later ">".
        $body = '';
        foreach (select_all($html1, $bodyPattern) as $fragment) {
            $body .= strip_tags($fragment);
        }

        // Same file naming and layout as before: title, CRLF, then the body.
        $written = file_put_contents("f" . $k . $kk . ".txt", $title . "\r\n" . $body);
        if ($written === false) {
            die("Unable to open file!");
        }
    }
}

簡單使用phpspider采集本博客文章內容