1. 程式人生 > >海量日誌資料,找出出現次數最多的IP地址。

海量日誌資料,找出出現次數最多的IP地址。

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Serializable value holder that pairs an IP address string with an integer
 * count.
 */
class IP implements Serializable {

	private static final long serialVersionUID = -8903000680469719698L;

	/** The IP address text, stored exactly as supplied. */
	private String ip;

	/** The count associated with {@link #ip}. */
	private int count;

	/**
	 * Creates a holder for the given address and count.
	 *
	 * @param ip2     the IP address text
	 * @param integer the count; unboxed on assignment, so {@code null} causes a
	 *                {@link NullPointerException}
	 */
	public IP(String ip2, Integer integer) {
		this.ip = ip2;
		this.count = integer.intValue();
	}

	/**
	 * @return the stored IP address text
	 */
	public String getIp() {
		return this.ip;
	}

	/**
	 * @param ip the new IP address text
	 */
	public void setIp(String ip) {
		this.ip = ip;
	}

	/**
	 * @return the stored count
	 */
	public int getCount() {
		return this.count;
	}

	/**
	 * @param count the new count
	 */
	public void setCount(int count) {
		this.count = count;
	}

}

/**
 * Finds the most frequent IP address in a very large text file that holds one
 * IP address per line.
 *
 * <p>
 * The input is too large to count in memory at once, so it is processed in
 * three steps:
 * <ol>
 * <li>every line is appended to one of {@link #PARTITION_COUNT} partition
 * files chosen by the hash of the line, so all occurrences of one address land
 * in the same partition file;</li>
 * <li>each partition file is counted with a hash map and its most frequent
 * address is recorded;</li>
 * <li>the per-partition winners are compared to obtain the overall
 * winner.</li>
 * </ol>
 * Because an address never spans two partitions, the count found for it in its
 * partition is its count in the whole input.
 */
public class No2 {
	/** Path of the input file; partition files are named from it. */
	static String fileLoc = "D:\\bigdata_ip.txt";

	/** Number of partition files the input is split into. */
	private static final int PARTITION_COUNT = 1024;

	/** Number of lines written by {@link #generateFile()}. */
	private static final int GENERATED_LINE_COUNT = 100000000;

	/**
	 * Runs the whole pipeline on {@link #fileLoc} and prints the most frequent
	 * IP address, its count, and timing information to standard output.
	 *
	 * @throws IOException              if the input or a partition file cannot
	 *                                  be read, written or replaced
	 * @throws ClassNotFoundException   never thrown by this implementation;
	 *                                  kept so existing callers still compile
	 * @throws IllegalArgumentException if the input contains no IP addresses
	 */
	public static void findIp() throws IOException, ClassNotFoundException {
		long start = System.currentTimeMillis();
		hashToSmallFiles();
		long end1 = System.currentTimeMillis();
		System.out.println("將大檔案對映成小檔案,用時:" + (end1 - start) + "毫秒");

		System.out.println("對映到小檔案完成,開始統計每個小檔案中出現頻率最高的ip");
		long start1 = System.currentTimeMillis();
		List<IP> list = countEverySmallFile();
		long end2 = System.currentTimeMillis();
		System.out.println("統計所有檔案共用時:" + (end2 - start1) + " 毫秒");

		System.out.println("統計完成,開始計算所有ip中出現頻率最高的ip");
		IP ip = calculateResult(list);
		System.out.println("訪問次數最多的ip是:" + ip.getIp() + ":" + ip.getCount());
		long end = System.currentTimeMillis();
		System.out.println("公用時:" + (end - start) + "毫秒");
	}

	/**
	 * Picks the entry with the highest count from the per-partition winners.
	 * When several entries share the highest count, the first one in list order
	 * is returned.
	 *
	 * @param list the most frequent IP of each partition file
	 * @return the entry with the largest count
	 * @throws IllegalArgumentException if {@code list} is {@code null} or empty
	 */
	private static IP calculateResult(List<IP> list) {
		if (list == null || list.isEmpty()) {
			throw new IllegalArgumentException("no per-partition results to compare; input has no IP addresses");
		}
		IP best = list.get(0);
		for (IP candidate : list) {
			if (candidate.getCount() > best.getCount()) {
				best = candidate;
			}
		}
		return best;
	}

	/**
	 * Counts the lines of every existing partition file and collects, for each
	 * one, the most frequent IP address together with its count.
	 *
	 * @return one entry per non-empty partition file
	 * @throws FileNotFoundException if a partition file disappears between the
	 *                               existence check and opening it
	 * @throws IOException           if reading a partition file fails
	 */
	private static List<IP> countEverySmallFile() throws FileNotFoundException, IOException {
		List<IP> list = new ArrayList<>();
		for (int i = 0; i < PARTITION_COUNT; i++) {
			File file = new File(partitionName(i));
			if (!file.exists()) {
				continue;
			}
			long startTime = System.currentTimeMillis();

			Map<String, Integer> counts = new HashMap<>();
			// try-with-resources: the reader is closed even when reading fails.
			try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
				String line;
				while ((line = reader.readLine()) != null) {
					Integer seen = counts.get(line);
					counts.put(line, seen == null ? 1 : seen + 1);
				}
			}

			String bestIp = null;
			int bestCount = 0;
			for (Map.Entry<String, Integer> entry : counts.entrySet()) {
				if (entry.getValue() > bestCount) {
					bestCount = entry.getValue();
					bestIp = entry.getKey();
				}
			}
			// An empty partition file has no winner to report.
			if (bestIp != null) {
				list.add(new IP(bestIp, bestCount));
			}

			long endTime = System.currentTimeMillis();
			System.out.println("已經統計檔案:" + fileLoc + i + ".txt,用時:" + (endTime - startTime) + " 毫秒");
		}
		return list;
	}

	/**
	 * Splits the input file into at most {@link #PARTITION_COUNT} partition
	 * files. Each line is trimmed, blank lines are skipped, and the remaining
	 * text is written to the partition selected by its hash code.
	 *
	 * <p>
	 * Partition files left over from an earlier run are deleted first, and new
	 * ones are created truncated, so repeated runs do not accumulate counts.
	 *
	 * @throws FileNotFoundException if the input file cannot be opened
	 * @throws IOException           if reading, writing, or deleting a stale
	 *                               partition file fails
	 */
	private static void hashToSmallFiles() throws FileNotFoundException, IOException {
		deleteStalePartitions();

		// One lazily opened writer per partition, indexed by partition number.
		FileWriter[] writers = new FileWriter[PARTITION_COUNT];
		Throwable primary = null;
		try (BufferedReader reader = new BufferedReader(new FileReader(fileLoc))) {
			String line;
			while ((line = reader.readLine()) != null) {
				String ip = line.trim();
				if (ip.isEmpty()) {
					continue;
				}
				// The remainder lies in (-PARTITION_COUNT, PARTITION_COUNT), so
				// Math.abs cannot overflow here.
				int bucket = Math.abs(ip.hashCode() % PARTITION_COUNT);
				FileWriter writer = writers[bucket];
				if (writer == null) {
					writer = new FileWriter(partitionName(bucket), false);
					writers[bucket] = writer;
				}
				writer.write(ip + "\n");
			}
		} catch (IOException | RuntimeException e) {
			primary = e;
			throw e;
		} finally {
			closeWriters(writers, primary);
		}
	}

	/**
	 * Builds the path of the partition file with the given index.
	 *
	 * @param index partition number
	 * @return {@link #fileLoc} followed by the index and {@code ".txt"}
	 */
	private static String partitionName(int index) {
		return fileLoc + index + ".txt";
	}

	/**
	 * Removes partition files that already exist so they cannot contribute
	 * lines from a previous run.
	 *
	 * @throws IOException if an existing partition file cannot be deleted
	 */
	private static void deleteStalePartitions() throws IOException {
		for (int i = 0; i < PARTITION_COUNT; i++) {
			File stale = new File(partitionName(i));
			if (stale.exists() && !stale.delete()) {
				throw new IOException("cannot delete stale partition file " + stale);
			}
		}
	}

	/**
	 * Closes every opened writer. Close failures are attached to
	 * {@code primary} as suppressed exceptions when it is non-null; otherwise
	 * the first close failure is thrown after all writers have been attempted.
	 *
	 * @param writers writers to close; {@code null} slots are skipped
	 * @param primary the exception already propagating, or {@code null}
	 * @throws IOException if closing fails and no other exception is
	 *                     propagating
	 */
	private static void closeWriters(FileWriter[] writers, Throwable primary) throws IOException {
		IOException closeFailure = null;
		for (FileWriter writer : writers) {
			if (writer == null) {
				continue;
			}
			try {
				writer.close();
			} catch (IOException e) {
				if (primary != null) {
					primary.addSuppressed(e);
				} else if (closeFailure == null) {
					closeFailure = e;
				} else {
					closeFailure.addSuppressed(e);
				}
			}
		}
		if (closeFailure != null) {
			throw closeFailure;
		}
	}

	/**
	 * Appends {@link #GENERATED_LINE_COUNT} random IP addresses, one per line,
	 * to {@link #fileLoc} and prints {@code done} when finished.
	 *
	 * @throws IOException if the file cannot be opened or written
	 */
	private static void generateFile() throws IOException {
		try (FileWriter fw = new FileWriter(fileLoc, true)) {
			for (int i = 0; i < GENERATED_LINE_COUNT; i++) {
				fw.write(generateIp() + "\n");
			}
		}
		System.out.println("done");
	}

	/**
	 * Generates a random dotted-quad IP address whose four octets are each in
	 * the range 0 to 255 inclusive.
	 *
	 * @return the generated address, for example {@code "12.0.255.7"}
	 */
	private static String generateIp() {
		StringBuilder ip = new StringBuilder();
		for (int i = 0; i < 4; i++) {
			if (i > 0) {
				ip.append('.');
			}
			// Math.random() is in [0, 1), so scaling by 256 yields 0..255.
			ip.append((int) (Math.random() * 256));
		}
		return ip.toString();
	}

	/**
	 * Command-line entry point: runs {@link #findIp()} and prints the stack
	 * trace of any failure.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		try {
			findIp();
		} catch (Exception e) {
			// Top-level boundary: report the failure instead of propagating it.
			e.printStackTrace();
		}
	}

}