程式人生 > java網頁爬蟲正則表示式

java網頁爬蟲正則表示式

package cn.itcast.regextest.demo;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal regex "crawler" demo: reads text line by line from a web page or a
 * local file and collects every substring that matches a regular expression.
 */
public class PachongDemo {

	/** Keyword searched for in the fetched news page. */
	private static final Pattern KEYWORD_PATTERN = Pattern.compile("南昌");

	/**
	 * E-mail-like pattern used when scanning the local file.
	 *
	 * NOTE(review): the original literal was partially destroyed by e-mail
	 * obfuscation on the page this file was copied from. Only the leading
	 * {@code \\} and the trailing {@code (\\w+\\.\\w+)+} survived; the part in
	 * between is a reconstruction -- confirm against the original source.
	 */
	private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@(\\w+\\.\\w+)+");

	/**
	 * Runs the web-page demo and prints each collected match on its own line.
	 *
	 * @param args unused
	 * @throws IOException if the page cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {
		List<String> list = demo_2();
		for (String match : list) {
			System.out.println(match);
		}
	}

	/**
	 * Fetches a fixed news page and collects every occurrence of the keyword.
	 *
	 * @return all matches, in the order they appear in the page
	 * @throws IOException if the URL cannot be opened or read
	 */
	public static List<String> demo_2() throws IOException {
		// URL = Uniform Resource Locator
		URL url = new URL("https://news.163.com/18/0929/09/DSS2A0NO0001875N.html");

		// NOTE(review): the stream is decoded with the platform default charset;
		// if the page uses a different encoding the keyword will never match.
		// try-with-resources closes the reader even when readLine() throws.
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
			return collectMatches(reader, KEYWORD_PATTERN);
		}
	}

	/**
	 * Crawls the local file {@code demo.txt} (resolved against the working
	 * directory) and collects every e-mail-like substring.
	 *
	 * @return all matches, in the order they appear in the file
	 * @throws IOException if the file is missing or cannot be read
	 */
	public static List<String> demo_1() throws IOException {
		try (BufferedReader reader = new BufferedReader(new FileReader("demo.txt"))) {
			return collectMatches(reader, EMAIL_PATTERN);
		}
	}

	/**
	 * Reads {@code reader} to the end and returns every match of {@code pattern}.
	 * Matching is done per line, so a match can never span a line break.
	 */
	private static List<String> collectMatches(BufferedReader reader, Pattern pattern) throws IOException {
		List<String> matches = new ArrayList<String>();
		String line;
		while ((line = reader.readLine()) != null) {
			Matcher matcher = pattern.matcher(line);
			// Loop instead of a single find(): a line may contain several
			// matches and all of them must be collected.
			while (matcher.find()) {
				matches.add(matcher.group());
			}
		}
		return matches;
	}
}