1. 程式人生 > Java 實現按行讀取檔案並且將行中的重複資料刪除

Java 實現按行讀取檔案並且將行中的重複資料刪除

package com.gaden.delerepeat;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.TreeSet;

import com.gaden.Transilate.WriteFile;

public class DeleRepeat {
 
 public static void main(String[] args ) throws Exception {
  if(args.length==2){
   String inputPath = args[0];
   String outputPath = args[1];
   String content = readFileDeleReapet(inputPath);
   WriteFile.writeFile(outputPath,content);
  }else{
   System.out.println("Error args!");
  }
  
 }
 
 public static String Delerepeat(String line){
  String ls="";
  line = line.replaceAll("[\\pP‘’“”]", ""); //把所有標點符號都去掉
  line = line.replaceAll("\\s{2

,}", " "); //將多個空格換成一個
  String[] str = line.trim().split(" ");
  TreeSet<String> set = new TreeSet<String>();
  for (int i = 0; i < str.length; i++) {
   set.add(str[i]);
   }
  str = (String[]) set.toArray(new String[0]);
  for (int i = 0; i < str.length; i++) {
    ls += str[i]+" ";
   }
   System.out.println(ls);
  return ls;
 }
 
 public static String readFileDeleReapet(String filePathAndName) {
  String fileContent = "";
  try {
   File f = new File(filePathAndName);
   if (f.isFile() && f.exists()) {
    InputStreamReader read = new InputStreamReader(
      new FileInputStream(f), "UTF-8");
    BufferedReader reader = new BufferedReader(read);
    String line;
    while ((line = reader.readLine()) != null) {
     fileContent += Delerepeat(line)+"\r\n";
    }
    read.close();
   }
  } catch (Exception e) {
   System.out.println("du wenjian cuo wu");
   e.printStackTrace();
  }
  return fileContent;
 }


}

匹配URL:

# Compiled, case-insensitive pattern for URL-like strings: an optional scheme,
# then a domain name, "localhost" or a dotted IPv4-style quad, then an optional
# ":port" and an optional "/path".
regex_luo = re.compile(
        r'(?:(?:http|ftp)s?://)?'  # optional scheme: http://, https://, ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port (the colon is part of the port group)
        # Optional path. NOTE(review): '/-~' inside the class is a character
        # range ('/' through '~'), so it also admits '?', '=', ':' and similar;
        # left unchanged -- confirm whether that is intended.
        r'(?:/[a-zA-Z0-9\&%_\./-~-]*)?', re.IGNORECASE)