1. 程式人生 > >爬蟲系列 一次採集.NET WebForm網站的坎坷歷程

爬蟲系列 一次採集.NET WebForm網站的坎坷歷程

 

今天接到一個活,需要統計人員的工號資訊,由於種種原因不能直接連資料庫 [無奈]、[無奈]、[無奈]。採取迂迴方案,寫個工具自動登入網站,採集使用者資訊。

這也不是第一次採集ASP.NET網站,以前採集的時候就知道,這種網站採集比較麻煩,尤其是WebForm的ASP.NET 網站,那叫一個費勁。

喜歡現在流行的Restful模式的網站,資料介面採集那才叫舒服。

閒話少說,開幹

工作量不大,HTTP純手寫

先準備好GET/POST兩個方法備用

    /// <summary>
    /// Performs a blocking HTTP GET against <paramref name="url"/> using the shared
    /// <c>sznyCookie</c> container and returns the response body as text.
    /// </summary>
    /// <param name="url">Absolute HTTP/HTTPS address to request.</param>
    /// <param name="SuccessCallback">Accepted for signature parity with <c>Post</c>; this method never invokes it.</param>
    /// <param name="FailCallback">Accepted for signature parity with <c>Post</c>; this method never invokes it.</param>
    /// <returns>The complete response body read with <see cref="StreamReader"/>'s default encoding.</returns>
    public static string Get(string url, Action<string> SuccessCallback, Action<string> FailCallback) {
            // A direct cast fails immediately with InvalidCastException for a non-HTTP URL,
            // instead of a NullReferenceException on the first property access below.
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "GET";
            req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
            req.Accept = "*/*";
            req.KeepAlive = true;
            req.ServicePoint.ConnectionLimit = int.MaxValue;
            req.ServicePoint.Expect100Continue = false;
            req.CookieContainer = sznyCookie; // static field, so cookies set by one request are sent with the next
            req.Credentials = System.Net.CredentialCache.DefaultCredentials;

            using (HttpWebResponse rsp = (HttpWebResponse)req.GetResponse())
            using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
            {
                return reader.ReadToEnd();
            }
        }
 
    /// <summary>
    /// Sends <paramref name="dicParms"/> as a URL-encoded form POST to <paramref name="url"/>
    /// using the shared <c>sznyCookie</c> container, and returns the response body.
    /// </summary>
    /// <param name="url">Absolute HTTP/HTTPS address; also sent as the Referer header.</param>
    /// <param name="dicParms">
    /// Form fields to submit. Entries whose key starts with "header" are not sent.
    /// When a "ScriptManager1" key is present, the headers of an ASP.NET AJAX partial
    /// postback (X-MicrosoftAjax / X-Requested-With) are added.
    /// </param>
    /// <param name="SuccessCallback">Invoked with the body when no login-failure marker is found; may be null.</param>
    /// <param name="FailCallback">Invoked with the body when a login-failure marker is found; may be null.</param>
    /// <returns>The complete response body.</returns>
    public static string Post(string url, Dictionary<string, string> dicParms, Action<string> SuccessCallback, Action<string> FailCallback) {
            // Build "k1=v1&k2=v2..." from every entry that is not a "header*" pseudo-parameter.
            StringBuilder data = new StringBuilder();
            foreach (var kv in dicParms) {
                // Ordinal comparison: the prefix is a fixed ASCII token, not linguistic text.
                if (kv.Key.StartsWith("header", StringComparison.Ordinal))
                    continue;
                if (data.Length > 0)
                    data.Append('&');
                data.Append($"{Common.UrlEncode( kv.Key,Encoding.UTF8)}={ Common.UrlEncode( kv.Value,Encoding.UTF8)}");
            }

            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "POST";
            req.KeepAlive = true;
            req.CookieContainer = sznyCookie;
            // NOTE(review): "KeepAlive" (no hyphen) is a non-standard Connection token, and
            // KeepAlive = true above already asks for a persistent connection. Kept as-is so
            // the bytes on the wire do not change; confirm whether it can be dropped.
            req.Connection = "KeepAlive";
            req.ContentType = "application/x-www-form-urlencoded";
            req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
            req.Referer = url;
            if (dicParms.ContainsKey("ScriptManager1"))
            {
                // Partial postback: override content type / accept and add the AJAX headers.
                req.Headers.Add("X-MicrosoftAjax", "Delta=true");
                req.Headers.Add("X-Requested-With", "XMLHttpRequest");
                req.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
                req.Accept = "*/*";
            }
            req.Headers.Add("Cache-Control", "no-cache");

            req.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
            req.ServicePoint.ConnectionLimit = int.MaxValue;
            req.ServicePoint.Expect100Continue = false;
            req.AllowAutoRedirect = true;
            req.Credentials = System.Net.CredentialCache.DefaultCredentials;

            byte[] buffer = Encoding.UTF8.GetBytes(data.ToString());
            using (Stream reqStream = req.GetRequestStream())
            {
                reqStream.Write(buffer, 0, buffer.Length);
            }

            string msg;
            using (HttpWebResponse rsp = (HttpWebResponse)req.GetResponse())
            using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
            {
                msg = reader.ReadToEnd();
            }

            // Callbacks run only after the response has been disposed, so user code
            // does not execute while the response stream is still held open.
            // Either marker in the body is treated as a login failure.
            bool loginFailed = msg.Contains("images/dl.jpg") || msg.Contains("pageRedirect||%2flogin.aspx");
            if (loginFailed)
                FailCallback?.Invoke(msg);
            else
                SuccessCallback?.Invoke(msg);

            return msg;
        }

 

整個過程分為登陸、使用者資訊列表、使用者資訊詳情,分三步走來完成這個專案

登陸

根據Chrome抓包結果編寫Login,帳號密碼沒有任何加密,直接明文顯示了,直接用了,根據是否跳轉頁面判斷是否登陸成功。除錯檢視結果登陸成功了。

 

 

根據上面的抓包資料,可以呼叫下面的程式碼確定是否登陸成功。

      /// <summary>
      /// Logs in by fetching the login page, copying its hidden WebForm fields
      /// (__VIEWSTATE / __EVENTVALIDATION) and posting them back with the credentials.
      /// </summary>
      /// <param name="username">Value submitted in the "Text_Name" field.</param>
      /// <param name="password">Value submitted in the "Text_Pass" field.</param>
      /// <param name="SuccessCallback">Forwarded to <c>Get</c> and <c>Post</c>.</param>
      /// <param name="FailCallback">Forwarded to <c>Get</c> and <c>Post</c>.</param>
      /// <returns>
      /// <c>true</c> when the POST response contains neither login-failure marker;
      /// <c>false</c> when it does, or when the login page came back empty.
      /// </returns>
      public static bool SznyLogin(string username, string password, Action<string> SuccessCallback, Action<string> FailCallback) {
            string url = "http://127.0.0.1/login.aspx";
            string msg = Get(url, SuccessCallback, FailCallback);
            if (msg.Trim().Length == 0)
                return false;

            // One generator for both click coordinates: two back-to-back `new Random()`
            // instances can share a time-based seed and yield correlated values.
            Random rnd = new Random();
            Dictionary<string, string> dicParms = new Dictionary<string, string>
            {
                { "__VIEWSTATE", "" },
                { "__EVENTVALIDATION", "" },
                { "Text_Name", "" },
                { "Text_Pass", "" },
                { "btn_Login.x", rnd.Next(100).ToString() },
                { "btn_Login.y", rnd.Next(200).ToString() },
            };

            // Fill in only the fields declared above with the values found on the login page.
            MatchCollection mc = Regex.Matches(msg, @"<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?value=""(?<val>[^""]*?)""[^<>]*?/?>|<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>|<select[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
            foreach (Match mi in mc)
            {
                string field = mi.Groups["name"].Value.Trim();
                if (dicParms.ContainsKey(field))
                    dicParms[field] = mi.Groups["val"].Value.Trim();
            }

            // Credentials always win over whatever the page pre-filled.
            dicParms["Text_Name"] = username;
            dicParms["Text_Pass"] = password;

            msg = Post(url, dicParms, SuccessCallback, FailCallback);

            // Same markers that Post uses to decide which callback to fire.
            return !(msg.Contains("images/dl.jpg") || msg.Contains("pageRedirect||%2flogin.aspx"));
        }

抓取人員資訊

看到下面這個頁面,失望了,列表上沒有工號,如果列表上有工號 設定一頁顯示全部資訊就可以把所有的資料都抓取到了。

 

 

 

換個思路:是不是我直接設定一頁顯示所有的資料後,然後根據員工ID可以獲取到所有的資訊呢?

接下來點選任意一條資訊後,檢視詳情,顯示下面的呼叫結果。Url上沒有ID,Get這條路走不通了,檢視Post的資料,更失望,沒有ID,通過行資訊繫結。傳統的WebForm 提交模式…

 

把所有的資料顯示到一頁,把列表的資料先採集完,最後再一個頁面一個頁面地採集工號資訊。

    /// <summary>
    /// Cookie container assigned to every request built in this file, so the
    /// session cookies obtained at login are sent with later requests.
    /// </summary>
    public static CookieContainer sznyCookie = new CookieContainer();
        /// <summary>
        /// Employee information: outer key is the employee's integer sequence number,
        /// inner dictionary maps a field name (e.g. "UserName") to its scraped text.
        /// </summary>
        public static Dictionary<int, Dictionary<string,string>> dicSznyEmployees = new Dictionary<int, Dictionary<string, string>>();

        /// <summary>
        /// Form parameters collected by SznyEmployeeList (hidden fields plus grid paging fields).
        /// NOTE(review): presumably reused as the POST template for per-employee detail requests — confirm against the consumer.
        /// </summary>
        public static Dictionary<string, string> dicSznyEmployeeParms = new Dictionary<string, string>();
        /// <summary>
        /// Employee sequence numbers queued for processing.
        /// </summary>
        public static ConcurrentQueue<int> queueSznyEmployeeInfo = new ConcurrentQueue<int>();

        /// <summary>
        /// NOTE(review): not written or read in the visible code; by its name it presumably
        /// holds sequence numbers whose detail fetch succeeded — confirm.
        /// </summary>
        public static ConcurrentQueue<int> queueSuccessEmployeeInfo = new ConcurrentQueue<int>();
 
    /// <summary>
    /// Loads the employee list page, posts back to show up to 1200 rows on one page,
    /// then fills <c>dicSznyEmployeeParms</c>, <c>queueSznyEmployeeInfo</c> and
    /// <c>dicSznyEmployees</c> from the response.
    /// </summary>
    /// <param name="SuccessCallback">Forwarded to <c>Get</c> and <c>Post</c>.</param>
    /// <param name="FailCallback">Forwarded to <c>Get</c> and <c>Post</c>.</param>
    /// <returns>
    /// Always <c>true</c>, including when the list page is too short to process
    /// (kept from the original so existing callers see no change).
    /// </returns>
    public static bool SznyEmployeeList(Action<string> SuccessCallback, Action<string> FailCallback)
        {
            const RegexOptions opts = RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture;
            // Matches <input>/<select> elements whose attributes use either double or single
            // quotes. Used for both harvest passes so they agree on what counts as a field.
            const string inputPattern = @"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>";
            const string pageSetKey = "MdGridView_t_unitemployees_dwyg$_PageSetText";
            const string totalCountKey = "MdGridView_t_unitemployees_dwyg_iTotalCount";

            string url = "http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx";
            string msg = Get(url, SuccessCallback, FailCallback);
            if (msg.Trim().Length <= 100)
                return true;

            // Target of the first __doPostBack(...) call on the page; sent as the grid id.
            Match firstPostBack = Regex.Match(msg, @"__doPostBack\('(?<name>[^']*?)'", opts);
            string name = firstPostBack.Success ? firstPostBack.Groups["name"].Value.Trim() : "";

            // Postback target of the "條/頁" (rows per page) link.
            string smname = "";
            Match m = Regex.Match(msg, @"(?<=<a[^<>]*?href=""javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)""[^<>]*?>條/頁)", opts);
            if (m.Success)
                smname = m.Value.Trim().Replace("'", "").Replace("&#39;", "");

            // ---- Step 1: post back asking for every row on a single page ----
            Dictionary<string, string> dicParms = new Dictionary<string, string>
            {
                { "ScriptManager1", $"UpdatePanel1|{smname}" },
                { "__EVENTTARGET", smname },
                { "__EVENTARGUMENT", "" },
                { "__VIEWSTATE", "" },
                { "__EVENTVALIDATION", "" },
                { "MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}" },
                { "MdGridView_t_unitemployees_dwyg_iCurrentPage", "1" },
                { "MdGridView_t_unitemployees_dwyg_iTotalPage", "1" },
                { totalCountKey, "1" },
                { "MdGridView_t_unitemployees_dwyg_iPageSize", "1" },
                { "MdGridView_t_unitemployees_dwyg_iPageCount", "1" },
                { "MdGridView_t_unitemployees_dwyg_iCurrentNum", "0" },
                { "XM", "ZXMCHECK" },
            };

            // Field names whose page values must NOT overwrite / extend the request.
            HashSet<string> skip = new HashSet<string>
            {
                "XM", "MdGridView_t_unitemployees_dwyg_iCurrentPage", "MdGridView_t_unitemployees_dwyg_GridViewID",
                "MdGridView_t_unitemployees_dwyg_iCurrentNum", "MdGridView_t_unitemployees_dwyg_iPageCount",
                "MdGridView_t_unitemployees_dwyg_iPageSize", "Button_Query", "__EVENTTARGET", "__EVENTARGUMENT",
                "Button_SelQuery", "Button_view", "Button_edit", "Button_out", "ImageButton_Tx", "ImageButton_xx1",
                "Button_qd", "__ASYNCPOST", "MdGridView_t_unitemployees_dwyg__PageSetText",
            };
            foreach (Match mi in Regex.Matches(msg, inputPattern, opts))
            {
                string field = mi.Groups["name"].Value.Trim();
                if (skip.Contains(field))
                    continue;
                dicParms[field] = mi.Groups["val"].Value.Trim();
            }
            // 1200 rows per page, whether or not the page already exposed the field.
            dicParms[pageSetKey] = "1200";

            msg = Post(url, dicParms, SuccessCallback, FailCallback);

            // ---- Step 2: rebuild the shared parameter template from the response ----
            dicSznyEmployees.Clear();
            dicSznyEmployeeParms.Clear();
            dicSznyEmployeeParms.Add("__EVENTTARGET", "");
            dicSznyEmployeeParms.Add("__EVENTARGUMENT", "");
            // Seeded from the initial GET; overwritten below if the response carries
            // <input> elements with the same names.
            dicSznyEmployeeParms.Add("__VIEWSTATE", dicParms["__VIEWSTATE"]);
            dicSznyEmployeeParms.Add("__EVENTVALIDATION", dicParms["__EVENTVALIDATION"]);
            dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
            dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", "1");
            dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "1");
            dicSznyEmployeeParms.Add(totalCountKey, "1");
            dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "1");
            dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "1");
            dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "0");
            dicSznyEmployeeParms.Add("XM", "ZXMCHECK");

            // The paging counters (iCurrentPage / iTotalPage / iTotalCount / iPageSize /
            // iPageCount) are deliberately NOT skipped here: the total count read just
            // below has to come from the response, not from the "1" placeholder above.
            skip = new HashSet<string>
            {
                "XM", "__EVENTTARGET", "__EVENTARGUMENT", "Button_Query", "Button_SelQuery",
                "Button_edit", "Button_out", "ImageButton_Tx", "ImageButton_xx1", "Button_qd",
                "MdGridView_t_unitemployees_dwyg_GridViewID", "MdGridView_t_unitemployees_dwyg_iCurrentNum",
            };
            foreach (Match mi in Regex.Matches(msg, inputPattern, opts))
            {
                string field = mi.Groups["name"].Value.Trim();
                if (skip.Contains(field))
                    continue;
                dicSznyEmployeeParms[field] = mi.Groups["val"].Value.Trim();
            }

            // A missing or non-numeric total is treated as "no employees" instead of throwing.
            int cnt;
            if (!int.TryParse(dicSznyEmployeeParms[totalCountKey], out cnt))
                cnt = 0;
            for (int i = 1; i <= cnt; i++)
                queueSznyEmployeeInfo.Enqueue(i);

            // ---- Step 3: scrape the visible rows ----
            // Cell 1 holds the sequence number; cells 2..14 map to these fields in order.
            string[] columns =
            {
                "UserName", "PersonID", "Birthday", "Sex", "HomePhone", "TelPhone", "Mail",
                "Address", "MinZu", "AddressJiGuan", "ZhengZhiMianmao", "Paiqianshijian", "Remark",
            };
            foreach (Match row in Regex.Matches(msg, @"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", opts))
            {
                MatchCollection cells = Regex.Matches(row.Value, "(?<=<td[^<>]*?>).*?(?=</td>)", opts);
                int ix;
                // Skip rows that lack the expected cells or a numeric sequence number.
                if (cells.Count < columns.Length + 2 || !int.TryParse(cells[1].Value.Trim(), out ix))
                    continue;

                Dictionary<string, string> employee;
                if (!dicSznyEmployees.TryGetValue(ix, out employee))
                {
                    employee = new Dictionary<string, string>();
                    dicSznyEmployees.Add(ix, employee);
                }
                // NOTE(review): together with the 1..cnt loop above, a row's sequence number
                // ends up queued twice. Kept as in the original; confirm the consumer expects it.
                queueSznyEmployeeInfo.Enqueue(ix);

                // Indexer assignment: a repeated sequence number overwrites instead of throwing.
                for (int c = 0; c < columns.Length; c++)
                    employee[columns[c]] = cells[c + 2].Value.Trim().Replace("&nbsp;", "");
            }
            return true;
        }

這樣所有的人員資訊一次性採集到靜態變數字典中了,剩下的一個工號可以慢慢獲取了。

 

既然是這樣,老實的分析Post資料,按照格式Post資料吧。

分析完Post的資料後,突發奇想,我是不是可以通過相同的__ViewState和__EVENTVALIDATION POST資料呢?說幹就幹。

寫程式碼跳轉到員工列表頁面,然後POST資料設定一頁顯示所有資料。

所有的POST的引數,儲存到一個靜態變數中。

發現POST批量提交的時候,前3次正常,以後就直接未登入。

 

果斷放棄,換思路。

那如果這樣不行 可不可以把所有的資料放到一個頁面上,然後每次獲取一次頁面,然後根據順序號POST資料呢。

上面已經把所有的列表資料都採集完了,順序號也固定了,然後在POST資料的時候,發現有的人員和工號不對應。

這時候去分析為什麼資料會出現不對應的情況呢?發現正則表示式寫的還有問題。獲取頁面的Input的時候,屬性有可能使用雙引號,也有可能使用單引號。

正則表示式由原來的

<input[^<>]*?name="(?<name>[^"]*?)"[^<>]*?value="(?<val>[^"]*?)"[^<>]*?/?>|<input[^<>]*?name="(?<name>[^"]*?)"[^<>]*?"[^<>]*?/?>|<select[^<>]*?name="(?<name>[^"]*?)"[^<>]*?"[^<>]*?/?>

修改為

<input[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?value=["'](?<val>[^'"]*?)["'][^<>]*?/?>|<input[^<>]*?name=["'](?<name>[^'"]*?)["'][^<>]*?["'][^<>]*?/?>|<select[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?["'][^<>]*?/?>

由於網站非同步提交,也就是以前WEBForm採用的ScriptManager,提交的時候返回的HTML不是整個Document,沒有注意,以為沒有返回__ViewState。所以採用GET的時候獲取的__ViewState繼續執行獲取工號的操作。發現獲取的工號都是錯誤,人員與工號對不上

麻爪了,不知道該咋辦了。猶豫了一下後,上Fiddler吧,一點點的看提交的引數是否有區別。發現正常網站在Get到頁面後,通過調整每頁x條資料後,提交的ViewState與原來的不一致。尋尋覓覓 覓覓尋尋  最後發現非同步返回的HTML中,最後有ViewState….

由於返回的資料順序,每次也不一樣,也是造成人員、工號不一致的原因。

 

提交後正常了,但是1000多條的員工資訊,每次提交都是2000多個引數。看著冗長的POST資料,無語了。這樣提交 先不說網站本身就慢。我提交這麼多網站會不會更慢,我的系統是不是也會更慢。

怎麼辦?

是不是有可能把分頁設定成每頁只有一條資料,然後每次翻頁,採集資料。簡單試試吧

先修改獲取列表頁面資料,把資料設定成一條每頁,此時不再採集列表中的資訊,而是記錄總共多少頁,放入佇列中,供定時任務去分頁採集資料。列表資訊通過後面的分頁資料採集。

由於網站是內部系統,為了不影響系統的正常執行,每次只採集一條資訊,等待這條資訊採集完成後,再採集下一頁資訊。

採集列表

     /// <summary>
     /// Starts an asynchronous GET of the employee list page; the response is
     /// handled by <c>RspSznyEmployeeList</c>.
     /// </summary>
     /// <param name="ix">Index handed through to the response handler.</param>
     /// <param name="SuccessCallback">Handed through to the response handler.</param>
     /// <param name="FailCallback">Handed through to the response handler.</param>
     public static void ReqSznyEmployeeList(int ix,Action<string> SuccessCallback, Action<string> FailCallback)
        {
            const string listUrl = "http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx";

            HttpWebRequest request = WebRequest.Create(listUrl) as HttpWebRequest;

            // Session and identity.
            request.CookieContainer = sznyCookie;
            request.Credentials = System.Net.CredentialCache.DefaultCredentials;

            // Request line and headers.
            request.Method = "GET";
            request.Accept = "*/*";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
            request.KeepAlive = true;

            // Connection-level settings.
            ServicePoint endpoint = request.ServicePoint;
            endpoint.ConnectionLimit = int.MaxValue;
            endpoint.Expect100Continue = false;

            // Slot order matters: the response handler unpacks this array by position.
            object[] state = { request, listUrl, ix, SuccessCallback, FailCallback };
            request.BeginGetResponse(RspSznyEmployeeList, state);
        }



     private static void RspSznyEmployeeList(IAsyncResult result)
        {
            object[] parms = result.AsyncState as object[];
            HttpWebRequest req = parms[0] as HttpWebRequest;
            string url = parms[1].ToString();
            int ix = int.Parse(parms[2].ToString());
            Action<string> SuccessCallback = parms[3] as Action<string>;
            Action<string> FailCallback = parms[4] as Action<string>;
            try
            {
                using (HttpWebResponse rsp = req.EndGetResponse(result) as HttpWebResponse)
                {
                    using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
                    {
                        string msg = "";
                        msg = reader.ReadToEnd();
                        //統計引數
                        //__doPostBack\('(?<name>[^']*?)'
                        //new Regex(@"__doPostBack\('(?<name>[^']*?)'", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
                        string name = "";
                        MatchCollection mc = Regex.Matches(msg, @"__doPostBack\('(?<name>[^']*?)'", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
                        foreach (Match mi in mc)
                        {
                            name = mi.Groups["name"].Value.Trim();
                            break;
                        }

                        //(?<=<a[^<>]*?href="javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)"[^<>]*?>條/頁)
                        //new Regex(@"(?<=<a[^<>]*?href=""javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)""[^<>]*?>條/頁)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
                        string smname = "MdGridView_t_unitemployees_dwyg$_SearchGo";
                        //Match m = Regex.Match(msg, @"(?<=<a[^<>]*?href=""javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)""[^<>]*?>條/頁)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
                        //<input[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?value=["'](?<val>[^'"]*?)["'][^<>]*?/?>|<input[^<>]*?name=["'](?<name>[^'"]*?)["'][^<>]*?["'][^<>]*?/?>|<select[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?["'][^<>]*?/?>
                        //new Regex(@"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
                        Dictionary<string, string> dicParms = new Dictionary<string, string>();
                        dicParms.Add("ScriptManager1", $"UpdatePanel1|{smname}");
                        dicParms.Add("__EVENTTARGET", smname);
                        dicParms.Add("__EVENTARGUMENT", "");
                        dicParms.Add("__VIEWSTATE", "");
                        dicParms.Add("__EVENTVALIDATION", "");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", ix.ToString());
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "1");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "1");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "1");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "1");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "0");
                        //dicParms.Add("MdGridView_t_unitemployees_dwyg$_SearchTextBox", ix.ToString());

                        dicParms.Add("XM", "ZXMCHECK");
                        List<string> lstParms = new List<string>() { "ScriptManager1", "XM", "MdGridView_t_unitemployees_dwyg_iCurrentNum",  "Button_Query", "__EVENTTARGET", "__EVENTARGUMENT", "Button_SelQuery", "Button_view", "Button_edit", "Button_out", "ImageButton_Tx", "ImageButton_xx1", "Button_qd",  "__ASYNCPOST", "MdGridView_t_unitemployees_dwyg__PageSetText" };
                        mc = Regex.Matches(msg, @"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
                        foreach (Match mi in mc)
                        {

                            if (lstParms.Contains(mi.Groups["name"].Value.Trim()))
                                continue;

                            if (dicParms.ContainsKey(mi.Groups["name"].Value.Trim()))
                                dicParms[mi.Groups["name"].Value.Trim()] = mi.Groups["val"].Value.Trim();
                            else
                                dicParms.Add(mi.Groups["name"].Value.Trim(), mi.Groups["val"].Value.Trim());
                        }
                        if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg$_PageSetText"))
                            dicParms["MdGridView_t_unitemployees_dwyg$_PageSetText"] = "1";
                        else
                            dicParms.Add("MdGridView_t_unitemployees_dwyg$_PageSetText", "1");//1200條 每頁
                        if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg_iPageCount"))
                            dicParms["MdGridView_t_unitemployees_dwyg_iPageCount"] = "1";
                        else
                            dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "1");
                        if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg_iPageSize"))
                            dicParms["MdGridView_t_unitemployees_dwyg_iPageSize"] = "1";
                        else
                            dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "1");

                        if (dicParms.ContainsKey("MdGridView_t_unitemployees_dwyg$_SearchTextBox"))
                            dicParms["MdGridView_t_unitemployees_dwyg$_SearchTextBox"] = $"{ix}";
                        else
                            dicParms.Add("MdGridView_t_unitemployees_dwyg$_SearchTextBox", $"{ix}");/*第幾頁*/

                        dicParms["MdGridView_t_unitemployees_dwyg_iTotalPage"] = dicParms["MdGridView_t_unitemployees_dwyg_iTotalCount"];
                        msg = Post(url, dicParms, SuccessCallback, FailCallback);

                        //獲取TR
                        //<tr[^<>]*?name="SelectTR"[^<>]*?>.*?</tr>
                        //new Regex(@"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
                        mc = Regex.Matches(msg, @"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
                        foreach (Match mi in mc)
                        {
                            //獲取td
                            //(?<=<td[^<>]*?>).*?(?=</td>)
                            //new Regex("(?<=<td[^<>]*?>).*?(?=</td>)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture)
                            MatchCollection mic = Regex.Matches(mi.Value, "(?<=<td[^<>]*?>).*?(?=</td>)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
                            if (!dicSznyEmployees.ContainsKey(ix))
                            {
                                dicSznyEmployees.Add(ix, new Dictionary<string, string>());
                            }
                            //queueSznyEmployeeInfo.Enqueue(ix);

                            if (!dicSznyEmployees[ix].ContainsKey("UserName"))
                                dicSznyEmployees[ix].Add("UserName", mic[2].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["UserName"] = mic[2].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("PersonID"))
                                dicSznyEmployees[ix].Add("PersonID", mic[3].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["PersonID"] = mic[3].Value.Trim().Replace("&nbsp;", "");

                            if (!dicSznyEmployees[ix].ContainsKey("Birthday"))
                                dicSznyEmployees[ix].Add("Birthday", mic[4].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["Birthday"] = mic[4].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("Sex"))
                                dicSznyEmployees[ix].Add("Sex", mic[5].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["Sex"] = mic[5].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("HomePhone"))
                                dicSznyEmployees[ix].Add("HomePhone", mic[6].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["HomePhone"] = mic[6].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("TelPhone"))
                                dicSznyEmployees[ix].Add("TelPhone", mic[7].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["TelPhone"] = mic[7].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("Mail"))
                                dicSznyEmployees[ix].Add("Mail", mic[8].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["Mail"] = mic[8].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("Address"))
                                dicSznyEmployees[ix].Add("Address", mic[9].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["Address"] = mic[9].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("MinZu"))
                                dicSznyEmployees[ix].Add("MinZu", mic[10].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["MinZu"] = mic[10].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("AddressJiGuan"))
                                dicSznyEmployees[ix].Add("AddressJiGuan", mic[11].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["AddressJiGuan"] = mic[11].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("ZhengZhiMianmao"))
                                dicSznyEmployees[ix].Add("ZhengZhiMianmao", mic[12].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["ZhengZhiMianmao"] = mic[12].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("Paiqianshijian"))
                                dicSznyEmployees[ix].Add("Paiqianshijian", mic[13].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["Paiqianshijian"] = mic[13].Value.Trim().Replace("&nbsp;", "");
                            if (!dicSznyEmployees[ix].ContainsKey("Remark"))
                                dicSznyEmployees[ix].Add("Remark", mic[14].Value.Trim().Replace("&nbsp;", ""));
                            else
                                dicSznyEmployees[ix]["Remark"] = mic[14].Value.Trim().Replace("&nbsp;", "");

                        }
                        dicParms.Clear();
                        mc = Regex.Matches(msg, @"(?<name>__VIEWSTATE)\|(?<v>[^\|]+)|(?<name>__EVENTVALIDATION)\|(?<v>[^\|]+)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
                        foreach (Match mi in mc)
                        {
                            dicParms.Add(mi.Groups["name"].Value.Trim(), mi.Groups["v"].Value.Trim());
                        }

                        dicParms.Add("HiddenField_param", "");
                        dicParms.Add("__EVENTTARGET", "");
                        dicParms.Add("__EVENTARGUMENT", "");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", ix.ToString());
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "1");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "1");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "1");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "1");
                        dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "0");
                        dicParms.Add("XM", "ZXMCHECK");
                        lstParms.Clear();
                        lstParms = new List<string>() { "XM", "__EVENTTARGET", "__EVENTARGUMENT", "Button_Query", "Button_SelQuery", };
                        lstParms.Add("Button_edit");
                        lstParms.Add("Button_out");
                        lstParms.Add("ImageButton_Tx");
                        lstParms.Add("ImageButton_xx1");
                        lstParms.Add("Button_qd");

                        mc = Regex.Matches(msg, @"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
                        foreach (Match mi in mc)
                        {
                            if (lstParms.Contains(mi.Groups["name"].Value.Trim()))
                                continue;
                            if (dicParms.ContainsKey(mi.Groups["name"].Value.Trim()))
                                dicParms[mi.Groups["name"].Value.Trim()] = mi.Groups["val"].Value.Trim();
                            else
                                dicParms.Add(mi.Groups["name"].Value.Trim(), mi.Groups["val"].Value.Trim());
                        }
                        ReqSznyEmployeeInfo(ix, dicParms, SuccessCallback, FailCallback);
                    }
                }
            }
            catch (Exception ex) {
                Business.queueSznyEmployeeInfo.Enqueue(ix);
                Business.queueMsg.Enqueue($"{DateTime.Now.ToString("yyy-MM-dd HH:mm:ss")}{ex.Message}");
            }
        }

獲取工號

    /// <summary>
    /// POSTs the given form fields to the EmployeeInfoList.aspx page and begins an asynchronous
    /// read of the response, which is completed in <c>RspSznyEmployeeInfo</c>.
    /// </summary>
    /// <param name="ix">Index of the employee record; passed through to the response callback.</param>
    /// <param name="dicParms">Form fields to send. Entries whose key starts with "header" are not sent.</param>
    /// <param name="SuccessCallback">Not invoked here; forwarded to the response callback via the async state.</param>
    /// <param name="FailCallback">Not invoked here; forwarded to the response callback via the async state.</param>
    public static void ReqSznyEmployeeInfo(int ix,Dictionary<string,string> dicParms, Action<string> SuccessCallback, Action<string> FailCallback) {
            // Build the application/x-www-form-urlencoded body: key=value pairs joined by '&'.
            StringBuilder data = new StringBuilder();
            foreach (var kv in dicParms)
            {
                // Keys are machine identifiers, so compare ordinally rather than by current culture.
                if (kv.Key.StartsWith("header", StringComparison.Ordinal))
                    continue;
                if (data.Length > 0)
                    data.Append('&');
                data.Append($"{Common.UrlEncode(kv.Key, Encoding.UTF8)}={Common.UrlEncode(kv.Value, Encoding.UTF8)}");
            }

            // Direct cast: an unexpected request type fails here instead of as a later null dereference.
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create("http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx");
            req.Method = "POST";
            req.KeepAlive = true;
            req.CookieContainer = sznyCookie;
            req.ContentType = "application/x-www-form-urlencoded";
            req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
            if (dicParms.ContainsKey("ScriptManager1"))
            {
                // When the form carries a ScriptManager1 field, send the request with the
                // ASP.NET AJAX partial-postback headers instead of the plain form-post ones.
                req.Headers.Add("X-MicrosoftAjax", "Delta=true");
                req.Headers.Add("X-Requested-With", "XMLHttpRequest");
                req.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
                req.Accept = "*/*";
            }
            req.Headers.Add("Cache-Control", "max-age=0");
            req.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
            req.ServicePoint.ConnectionLimit = int.MaxValue;
            req.ServicePoint.Expect100Continue = false;
            req.AllowAutoRedirect = true;
            req.Credentials = System.Net.CredentialCache.DefaultCredentials;

            byte[] buffer = Encoding.UTF8.GetBytes(data.ToString());
            using (Stream reqStream = req.GetRequestStream())
            {
                reqStream.Write(buffer, 0, buffer.Length);
            }

            // State layout read back by the callback: { request, ix, dicParms, SuccessCallback, FailCallback }.
            req.BeginGetResponse(new AsyncCallback(RspSznyEmployeeInfo), new object[] { req,ix, dicParms, SuccessCallback, FailCallback });
        }

        /// <summary>
        /// Completion callback for the request started by <c>ReqSznyEmployeeInfo</c>. Reads the response
        /// HTML, extracts the value of the <c>TextBox_YG_Code_str</c> input and stores it under the
        /// "Code" key of the employee record, then queues the record index as successfully collected.
        /// On any exception the index is put back on the retry queue and the error message is logged.
        /// </summary>
        /// <param name="result">
        /// Async result whose state is an <c>object[]</c> laid out as
        /// { request, ix, dicParms, SuccessCallback, FailCallback }; only the first two are used here.
        /// </param>
        private static void RspSznyEmployeeInfo(IAsyncResult result)
        {
            object[] parms = (object[])result.AsyncState;
            HttpWebRequest req = (HttpWebRequest)parms[0];
            // ix was boxed as an int by the caller; unbox it directly instead of a ToString/Parse round trip.
            int ix = (int)parms[1];
            try
            {
                string msg;
                using (HttpWebResponse rsp = (HttpWebResponse)req.EndGetResponse(result))
                using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
                {
                    msg = reader.ReadToEnd();
                }

                // Placeholder stored when the page has no employee-code input.
                string code = "無";

                // The two alternatives cover both attribute orders: name before value, and value before name.
                Match m = Regex.Match(msg, @"<input[^<>]*?name\s*?=\s*?[""']TextBox_YG_Code_str[""'][^<>]*?value\s*?=\s*?[""'](?<code>[^""']*?)[""']|<input[^<>]*?value\s*?=\s*?[""'](?<code>[^""']*?)[""'][^<>]*?name\s*?=\s*?[""']TextBox_YG_Code_str[""']", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
                if (m.Success)
                    code = m.Groups["code"].Value.Trim();

                // The indexer setter adds the key when missing and overwrites it otherwise.
                dicSznyEmployees[ix]["Code"] = code;
                queueSuccessEmployeeInfo.Enqueue(ix);
            }
            catch (Exception ex) {
                // Put the record back on the work queue so it is retried, and surface the error message.
                Business.queueSznyEmployeeInfo.Enqueue(ix);
                Business.queueMsg.Enqueue($"{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss")}{ex.Message}");
            }
        }

 

入庫

採集到的資訊,通過定時任務儲存到資料庫。

        // Background writer: drains successfully collected employee indexes in batches
        // and hands the corresponding records to DbAccess.AddTran.
        // LongRunning: the loop never ends, so it should not occupy a thread-pool worker.
        Task.Factory.StartNew(() => {
                  const int batchSize = 50;
                  while (true) {
                    List<Dictionary<string, string>> lst = new List<Dictionary<string, string>>(batchSize);
                    // Drive the loop by TryDequeue's result: if the dequeue fails, ix would be the
                    // default 0 and record 0 would be written by mistake.
                    while (lst.Count < batchSize && Business.queueSuccessEmployeeInfo.TryDequeue(out int ix)) {
                        lst.Add(Business.dicSznyEmployees[ix]);
                    }
                    if (lst.Count == 0) {
                        // Nothing to persist yet; poll again in a second.
                        Thread.Sleep(1000);
                        continue;
                    }
                    DbAccess.AddTran(lst, "SznyEmployee",new List<string>() { "UserName", "PersonID" });

                    Thread.Sleep(1);
                }
            }, TaskCreationOptions.LongRunning);

 

總結

為了利用已經採集到的資訊、避免重複採集,採集時會先判斷資料庫中是否已存在該筆資料。這純粹是為了提高效率,WebForm的網站真是太慢、太慢了。

以前寫非同步純粹是為了提高執行緒效率,在.NET中感覺不到快樂。

終於搞定了,資料已經成功入庫了。

.NET的沒落也是有原因的,網站的速度的確是慢,.net押寶.net core的新體驗了。

我討厭採集WebForm網站,寫了這麼久的爬蟲,祈禱永遠不要再碰到WebForm了。

 

 

 

 秀一下結果,慶祝一下吧