pandas 處理資料一(抽取特定URL,正則匹配)
阿新 • • 發佈:2018-11-16
目的:對第一個檔案中的每一個 URL 特徵,統計第二個檔案的 URL 中有多少條與它匹配(以正則表達式比對)。
第一個檔案截圖(共23個特徵資料):
第二個檔案截圖,共6萬多URL資料:
結果截圖:
"""Count how many crawled URLs match each feature URL pattern.

Reads the crawled URLs from ``cluster_all.csv`` and the feature patterns
from the ``URL`` column of ``url.csv``, then prints one ``pattern count``
line per feature pattern.
"""
import re

import numpy as np
import pandas as pd


def load_all_urls(path='cluster_all.csv'):
    """Return every non-null value of the URL column of *path* as strings.

    The columns at positions 0 and 2 are dropped and the two remaining
    columns are renamed to ``url`` and ``y_pre_20``, so the file is expected
    to have exactly two columns left after the drop.
    """
    df = pd.read_csv(path)
    df.drop(df.columns[[0, 2]], axis=1, inplace=True)
    df.columns = ['url', 'y_pre_20']
    # NaN cells would break the string handling further down; skip them.
    return df.url.dropna().astype(str).tolist()


def load_feature_patterns(path='url.csv'):
    """Return the non-null values of the ``URL`` column of *path* as strings."""
    return pd.read_csv(path).URL.dropna().astype(str).tolist()


def normalize_url(url):
    """Return *url* with every space replaced by ``/``."""
    return url.replace(' ', '/')


def count_matches(patterns, urls):
    """Count, for each pattern, how many URLs it matches.

    Args:
        patterns: Iterable of regular-expression strings. They are used as
            regexes, not literals, so metacharacters such as ``.`` and ``?``
            keep their regex meaning.
        urls: Iterable of URL strings; each is normalized with
            ``normalize_url`` before matching.

    Returns:
        A dict mapping each pattern string to the number of URLs in which
        ``re.search`` finds the pattern.

    Raises:
        re.error: If a pattern is not a valid regular expression.
    """
    # Normalize once instead of once per (pattern, url) pair.
    normalized = [normalize_url(url) for url in urls]
    counts = {}
    for pattern in patterns:
        # search() already scans the whole string, so wrapping the pattern
        # in '.*?' ... '.*' is unnecessary. Compile once per pattern.
        regex = re.compile(pattern)
        # Key by the pattern itself; ``update(var=k)`` would store every
        # result under the literal key 'var'.
        counts[pattern] = sum(1 for url in normalized if regex.search(url))
    return counts


def main():
    """Load both CSV files and print the match count of every feature."""
    url_all_list = load_all_urls()
    url_fea_list = load_feature_patterns()
    count_dict = count_matches(url_fea_list, url_all_list)
    for var in url_fea_list:
        print(var, count_dict[var])


if __name__ == '__main__':
    main()