1. 程式人生 > >如何打造網站克隆仿站工具、提供原始碼下載(.NET版)

如何打造網站克隆仿站工具、提供原始碼下載(.NET版)

前兩天朋友叫我模仿一個網站。剛開始時,我一個頁面一個頁面地檢視原始碼並複製、儲存,花了很多時間,一個字:“累”。為了減輕工作量,我寫了個網站“克隆工具”,可以一鍵克隆;比起人工操作,效率提高了200%以上,精確度也大大提高。雖然網上已有很多網站克隆工具,但我覺得作為一個程式設計師要有點研究精神,哈哈哈,而且自己寫的工具可以根據自己的需要隨意編寫需要的功能。

下面我將我寫的“網站克隆工具”實現方法分享給大家,原始碼在文末有下載連結(.NET開發的,VS2012開發工具),有需要的朋友可以下載來玩,也可以根據自己的需要做相應的修改或優化。

一睹為快,先看看介面:

簡單的工作流程:

專案程式碼目錄結構:

下面一步步實現程式功能:

1.新建主介面窗體(MainForm.cs):

2.新建模型類(UrlModel.cs)

1

2

3

4

5

6

7

8

9

10

11

/// <summary>
/// A URL broken into the pieces the clone tool works with.
/// </summary>
public class UrlModel
{
    /// <summary>
    /// The path as it was found: for a parsed page URL, everything from the
    /// first '/' after the host; for a link found in a page, the raw
    /// href/src value.
    /// </summary>
    public string RelatedPath { get; set; }

    /// <summary>The complete URL used to request the resource.</summary>
    public string AbsoluteUri { get; set; }

    /// <summary>The URL of the containing directory (everything before the last '/').</summary>
    public string CurrPath { get; set; }

    /// <summary>The site root in the form <c>scheme://host[:port]</c>.</summary>
    public string RootPath { get; set; }

    /// <summary>Host name without the port.</summary>
    public string Host { get; set; }

    /// <summary>TCP port of the site.</summary>
    public int Port { get; set; }

    /// <summary>URL scheme (http or https).</summary>
    public string Scheme { get; set; }
}

3.新建服務類(Services)

UrlParser:

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

/// <summary>
/// Splits an absolute http/https URL into a <see cref="UrlModel"/>.
/// </summary>
public class UrlParser
{
    // Scheme followed by the authority (host[:port]) up to the first '/'.
    // Anchored and case-insensitive so it agrees with the prefix check in Parse.
    private static readonly Regex AuthorityRegex = new Regex(
        "^(?<scheme>(http|https))://(?<host>.+?)/",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Parses <paramref name="url"/> into its scheme, host, port, root,
    /// containing directory and site-relative path.
    /// </summary>
    /// <param name="url">An absolute URL starting with http: or https:.</param>
    /// <returns>A fully populated <see cref="UrlModel"/>.</returns>
    /// <exception cref="Exception">
    /// The url is null, shorter than 8 characters, does not start with
    /// http:/https:, or has no recognisable authority part.
    /// </exception>
    public static UrlModel Parse(string url)
    {
        if (url == null || url.Length < 8)
            throw new Exception("url引數不正確");

        if (!url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            && !url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
            throw new Exception("url格式有誤");

        // "http://host" has no path separator after the authority; add one so
        // the authority regex (which needs a terminating '/') can match.
        if (url.LastIndexOf('/') < 8)
            url = url + "/";

        Match match = AuthorityRegex.Match(url);
        if (!match.Success)
            throw new Exception("url解析失敗!");

        string scheme = match.Groups["scheme"].Value.ToLowerInvariant();
        string authority = match.Groups["host"].Value;
        int defaultPort = scheme == "https" ? 443 : 80;

        // Split on the LAST colon so that authorities containing several
        // colons (e.g. "[::1]:8080") still yield a host; when the tail is not
        // a number the whole authority is the host and the default port applies.
        string host = authority;
        int port = defaultPort;
        int colon = authority.LastIndexOf(':');
        if (colon >= 0)
        {
            string portText = authority.Substring(colon + 1);
            int explicitPort;
            if (portText.Length == 0)
            {
                host = authority.Substring(0, colon);
            }
            else if (int.TryParse(portText, out explicitPort) && explicitPort > 0)
            {
                host = authority.Substring(0, colon);
                port = explicitPort;
            }
        }

        // Index of the '/' that terminates the authority.
        int pathStart = match.Index + match.Length - 1;

        UrlModel model = new UrlModel();
        model.Host = host;
        model.Port = port;
        model.Scheme = scheme;
        model.AbsoluteUri = url;
        model.RelatedPath = url.Substring(pathStart);
        model.CurrPath = url.Substring(0, url.LastIndexOf('/'));

        // Only spell out the port when it differs from the scheme's default.
        model.RootPath = port == defaultPort
            ? string.Format("{0}://{1}", scheme, host)
            : string.Format("{0}://{1}:{2}", scheme, host, port);

        return model;
    }
}

WebPageService:

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

/// <summary>
/// Extracts the site-local links (href) and resource references (src / CSS
/// url(...)) from a page's source and resolves them against the page URL.
/// </summary>
public class WebPageService
{
    // References starting with one of these prefixes are off-site or not
    // downloadable, so they are never returned. "data:" covers inline data URIs.
    private static readonly string[] excludekeys =
        { "http:", "https:", "//", "#", "javascript:", "?", "tel:", "mailto:", "data:" };

    private static readonly Regex HrefRegex =
        new Regex("href=\"(?<Url>.+?)\"", RegexOptions.IgnoreCase);

    private static readonly Regex SrcRegex =
        new Regex("(src=\"(?<Url>.+?)\"|url\\((?<Url>.+?)\\))", RegexOptions.IgnoreCase);

    /// <summary>
    /// Returns every distinct site-local href value found in <paramref name="html"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page the source came from.</param>
    /// <param name="html">Source text of the page.</param>
    /// <returns>The resolved links in document order; empty when html is null or empty.</returns>
    public static List<UrlModel> GetLocalHrefs(string url, string html)
    {
        return CollectLocal(url, html, HrefRegex);
    }

    /// <summary>
    /// Returns every distinct site-local src / url(...) value found in <paramref name="html"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page the source came from.</param>
    /// <param name="html">Source text of the page (HTML or CSS).</param>
    /// <returns>The resolved resources in document order; empty when html is null or empty.</returns>
    public static List<UrlModel> GetLocalSrcs(string url, string html)
    {
        return CollectLocal(url, html, SrcRegex);
    }

    // Shared implementation: match, clean, filter, de-duplicate, resolve.
    private static List<UrlModel> CollectLocal(string url, string html, Regex pattern)
    {
        List<UrlModel> found = new List<UrlModel>();
        if (string.IsNullOrEmpty(html))
            return found;

        UrlModel page = UrlParser.Parse(url);

        // De-duplicate case-insensitively; the first occurrence wins.
        HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);

        foreach (Match match in pattern.Matches(html))
        {
            string reference = CleanReference(match.Groups["Url"].Value);
            if (reference.Length == 0 || IsExcluded(reference))
                continue;

            if (!seen.Add(reference.ToLowerInvariant()))
                continue;

            found.Add(BuildModel(page, reference));
        }

        return found;
    }

    // Strips surrounding whitespace, the quotes CSS allows inside url(...),
    // and any #fragment (fragments are never sent to the server).
    private static string CleanReference(string raw)
    {
        string value = raw.Trim().Trim('\'', '"').Trim();
        int fragment = value.IndexOf('#');
        if (fragment >= 0)
            value = value.Substring(0, fragment);
        return value;
    }

    // True when the reference starts with one of the excluded prefixes.
    private static bool IsExcluded(string reference)
    {
        foreach (string prefix in excludekeys)
        {
            if (reference.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
                return true;
        }
        return false;
    }

    // Creates the model for one reference found on the given page.
    private static UrlModel BuildModel(UrlModel page, string reference)
    {
        UrlModel model = new UrlModel();
        model.RelatedPath = reference;
        model.CurrPath = page.CurrPath;
        model.RootPath = page.RootPath;
        model.Scheme = page.Scheme;
        model.Port = page.Port;
        model.Host = page.Host;

        if (reference.StartsWith("/", StringComparison.Ordinal))
        {
            // Site-absolute path: append to the site root.
            model.AbsoluteUri = string.Format("{0}{1}", page.RootPath, reference);
        }
        else
        {
            // Document-relative path: resolve against the page's directory.
            model.AbsoluteUri = ResolveRelative(page.CurrPath, reference);
        }

        return model;
    }

    // Resolves leading "./" and "../" segments of a relative reference against
    // a directory URL. Climbing stops at the site root instead of eating into
    // the "scheme://host" part.
    private static string ResolveRelative(string currentDirectory, string reference)
    {
        string directory = currentDirectory;
        string remainder = reference;

        // First index at which a '/' is a path separator rather than part of "://".
        int floor = directory.IndexOf("://", StringComparison.Ordinal) + 3;

        while (true)
        {
            if (remainder.StartsWith("./", StringComparison.Ordinal))
            {
                remainder = remainder.Substring(2);
                continue;
            }

            bool isParent = remainder == ".."
                || remainder.StartsWith("../", StringComparison.Ordinal);
            if (!isParent)
                break;

            remainder = remainder.Length > 2 ? remainder.Substring(3) : string.Empty;

            int slash = directory.LastIndexOf('/');
            if (slash >= floor)
                directory = directory.Substring(0, slash);
        }

        return string.Format("{0}/{1}", directory, remainder);
    }
}

4.網頁原始碼扒取類

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

/// <summary>
/// Minimal HTTP helpers used by the clone tool: fetch a page as text and
/// download a resource to disk.
/// </summary>
public class HttpTool
{
    /// <summary>
    /// Performs a GET request and returns the response body as trimmed text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="referer">Value sent in the Referer header.</param>
    /// <param name="encoding">Name of the text encoding used to decode the body (e.g. "utf-8").</param>
    /// <param name="msg">Empty on success; the exception message and stack trace on failure.</param>
    /// <returns>The decoded body, or an empty string when the request failed.</returns>
    public static string HttpGet(string url, string referer, string encoding, out string msg)
    {
        msg = string.Empty;
        string result = string.Empty;

        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
            request.Referer = referer;
            request.Method = "GET";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36";
            request.Timeout = 60000; // one minute

            // using-blocks release the connection even when decoding throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            {
                if (responseStream != null)
                {
                    using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.GetEncoding(encoding)))
                    {
                        result = reader.ReadToEnd().Trim();
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Failures are reported through msg; the caller gets an empty body.
            msg = ex.Message + ex.StackTrace;
        }

        return result;
    }

    /// <summary>
    /// Downloads <paramref name="uRLAddress"/> to the file
    /// <paramref name="filename"/> inside <paramref name="localPath"/>,
    /// replacing any existing file. The content is streamed, so the size of
    /// the resource is not limited by a buffer.
    /// </summary>
    /// <param name="uRLAddress">Address of the resource.</param>
    /// <param name="localPath">Existing directory that receives the file.</param>
    /// <param name="filename">Name of the file to write.</param>
    public static void DownFile(string uRLAddress, string localPath, string filename)
    {
        string path = Path.Combine(localPath, filename);

        using (WebClient client = new WebClient())
        using (Stream source = client.OpenRead(uRLAddress))
        {
            // The remote stream is opened first so that an unreachable
            // resource never creates an empty local file.
            bool created = false;
            try
            {
                // FileMode.Create truncates, so overwriting a longer file
                // leaves no stale bytes behind.
                using (FileStream target = new FileStream(path, FileMode.Create, FileAccess.Write))
                {
                    created = true;
                    source.CopyTo(target);
                }
            }
            catch
            {
                // Do not leave a half-written file that would later be
                // mistaken for a finished download.
                if (created)
                    TryDelete(path);
                throw;
            }
        }
    }

    // Best-effort removal of a partial download; the original failure is what
    // the caller needs to see, so cleanup errors are not allowed to replace it.
    private static void TryDelete(string path)
    {
        try
        {
            File.Delete(path);
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
    }
}

5.網站克隆主類

介面:

/// <summary>
/// Contract of a site-clone job that can be started and cancelled.
/// </summary>
internal interface IWebCloneWorker
{
    /// <summary>Begins the clone job.</summary>
    void Start();

    /// <summary>Requests that the clone job stop.</summary>
    void Cancel();
}

實現類:

複製程式碼

/// <summary>
/// Clones a web site: collects the pages and resources reachable from
/// <see cref="Url"/> on a <see cref="BackgroundWorker"/>, then saves each
/// collected item below <see cref="SavePath"/>.
/// </summary>
public class WebCloneWorker : IWebCloneWorker
{
    // How many link levels below the start page are followed
    // (e.g. 0 - home page, 1 - category pages, 2 - detail pages).
    public static int depth = 0;

    // Address of the site to clone.
    public string Url { get; set; }

    // Directory the cloned files are written to.
    public string SavePath { get; set; }

    private BackgroundWorker backgroundWorker1 = null;
    public event UrlChangedEventHandler UrlChanged;
    public event FileSavedSuccessEventHandler FileSavedSuccess;
    public event FileSavedFailEventHandler FileSavedFail;
    public event DownloadCompletedEventHandler DownloadCompleted;
    public event CollectingUrlEventHandler CollectingUrl;
    public event CollectedUrlEventHandler CollectedUrl;
    public event ProgressChangedEventHandler ProgressChanged;

    // Every collected page / resource, keyed by its RelatedPath.
    private Dictionary<string, UrlModel> _Hrefs = new Dictionary<string, UrlModel>();

    /// <summary>
    /// Every collected page / resource, keyed by its RelatedPath.
    /// </summary>
    public Dictionary<string, UrlModel> Hrefs
    {
        get { return _Hrefs; }
        set { _Hrefs = value; }
    }

    // Name of the encoding used to decode pages; UTF-8 by default.
    private string _Encoding = "utf-8";

    /// <summary>
    /// Name of the encoding used to decode pages; UTF-8 by default.
    /// </summary>
    public string Encoding
    {
        get { return _Encoding; }
        set { _Encoding = value; }
    }

    /// <summary>
    /// Creates a worker whose <see cref="Url"/> and <see cref="SavePath"/>
    /// must be assigned before <see cref="Start"/> is called.
    /// </summary>
    public WebCloneWorker()
    {
        InitializeWorker();
    }

    /// <summary>
    /// Creates a worker for the given site and target directory.
    /// </summary>
    /// <param name="url">Address of the site to clone.</param>
    /// <param name="path">Directory the files are saved to.</param>
    /// <exception cref="ArgumentException">Either argument is null or empty.</exception>
    public WebCloneWorker(string url, string path)
    {
        if (string.IsNullOrEmpty(url))
            throw new ArgumentException("請輸入網址");

        if (string.IsNullOrEmpty(path))
            throw new ArgumentException("請選擇要儲存的目錄");

        this.Url = url;
        this.SavePath = path;

        InitializeWorker();
    }

    // Creates the background worker and wires its three callbacks. Called by
    // every constructor so Start/Cancel never see a null worker.
    private void InitializeWorker()
    {
        backgroundWorker1 = new BackgroundWorker();
        backgroundWorker1.WorkerReportsProgress = true;
        backgroundWorker1.WorkerSupportsCancellation = true;
        backgroundWorker1.DoWork += backgroundWorker1_DoWork;
        backgroundWorker1.ProgressChanged += backgroundWorker1_ProgressChanged;
        backgroundWorker1.RunWorkerCompleted += backgroundWorker1_RunWorkerCompleted;
    }

    // Raises DownloadCompleted when the job ended without being cancelled.
    void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        if (e.Cancelled)
        {
            return;
        }

        DownloadCompletedEventHandler handler = this.DownloadCompleted;
        if (handler != null)
        {
            // e.Result throws when the job failed, so it may only be read
            // when there is no error.
            object result = e.Error == null ? e.Result : null;
            handler(this, new DownloadCompletedEventArgs(result, e.Error, e.Cancelled));
        }
    }

    // Handles one progress report: forwards it, announces the URL and saves
    // the item to disk.
    // NOTE(review): BackgroundWorker raises ProgressChanged through the
    // synchronization context captured by RunWorkerAsync, so in a WinForms
    // app this download presumably runs on the UI thread - confirm.
    void backgroundWorker1_ProgressChanged(object sender, ProgressChangedEventArgs e)
    {
        ProgressChangedEventHandler progress = this.ProgressChanged;
        if (progress != null)
            progress(this, e);

        UrlModel model = (UrlModel)e.UserState;

        UrlChangedEventHandler changed = this.UrlChanged;
        if (changed != null)
        {
            changed(this, new UrlChangedEventArgs(model));
        }

        try
        {
            string url = model.AbsoluteUri;

            // Site-relative path of the item, without the query string: the
            // query is needed for the request but must not end up in the
            // local directory or file name.
            string sitePath = url.Substring(url.IndexOf('/', 8));
            int queryStart = sitePath.IndexOf('?');
            if (queryStart >= 0)
                sitePath = sitePath.Substring(0, queryStart);

            string fileName = Path.GetFileName(sitePath);

            // Directory-style URLs (no file name, or no extension) are saved
            // as index.html inside a directory of that name.
            if (string.IsNullOrEmpty(fileName) || fileName.IndexOf('.') < 0)
            {
                fileName = "index.html";

                if (!sitePath.EndsWith("/", StringComparison.Ordinal))
                    sitePath = sitePath + "/";
            }

            fileName = System.Web.HttpUtility.UrlDecode(fileName);

            // Plain concatenation on purpose: the directory part is rooted,
            // and Path.Combine would discard SavePath for a rooted argument.
            string localPath = string.Format("{0}{1}", this.SavePath, Path.GetDirectoryName(sitePath));
            if (!Directory.Exists(localPath))
            {
                Directory.CreateDirectory(localPath);
            }

            // Already downloaded: nothing to do.
            if (File.Exists(Path.Combine(localPath, fileName)))
            {
                return;
            }

            HttpTool.DownFile(url, localPath, fileName);

            FileSavedSuccessEventHandler saved = this.FileSavedSuccess;
            if (saved != null)
            {
                saved(this, new FileSavedSuccessEventArgs(model));
            }
        }
        catch (Exception ex)
        {
            FileSavedFailEventHandler failed = this.FileSavedFail;
            if (failed != null)
            {
                failed(this, new FileSavedFailEventArgs(ex));
            }
        }
    }

    // Worker body: collect all URLs, then report each one so that the
    // progress handler saves it.
    void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
    {
        GetResource();

        int index = 1;
        foreach (KeyValuePair<string, UrlModel> entry in this.Hrefs)
        {
            if (backgroundWorker1.CancellationPending)
            {
                e.Cancel = true;
                return;
            }

            backgroundWorker1.ReportProgress(index, entry.Value);
            index++;

            // Pause 200 ms between items.
            Thread.Sleep(200);
        }

        // A cancel request that arrived while collecting must still mark the
        // run as cancelled, even when nothing was collected.
        if (backgroundWorker1.CancellationPending)
        {
            e.Cancel = true;
        }
    }

    /// <summary>
    /// Starts the clone job in the background; does nothing while a job is running.
    /// </summary>
    /// <exception cref="InvalidOperationException">Url or SavePath is not set.</exception>
    public void Start()
    {
        if (this.backgroundWorker1.IsBusy)
            return;

        if (string.IsNullOrEmpty(this.Url) || string.IsNullOrEmpty(this.SavePath))
            throw new InvalidOperationException("Url and SavePath must be set before Start is called.");

        this.backgroundWorker1.RunWorkerAsync();
    }

    /// <summary>
    /// Requests cancellation of the running job.
    /// </summary>
    public void Cancel()
    {
        if (this.backgroundWorker1.CancellationPending)
            return;

        this.backgroundWorker1.CancelAsync();
    }

    // Fetches the start page, collects everything reachable from it and then
    // raises CollectedUrl.
    private void GetResource()
    {
        string msg;
        string html = HttpTool.HttpGet(this.Url, this.Url, this.Encoding, out msg);

        GetHrefs(0, this.Url, html);

        CollectedUrlEventHandler collected = this.CollectedUrl;
        if (null != collected)
        {
            collected(this, new CollectedUrlEventArgs(new UrlModel()));
        }
    }

    // Adds one item to Hrefs (first occurrence of a RelatedPath wins) and
    // raises CollectingUrl for newly added items.
    private void Collect(UrlModel model)
    {
        if (model == null || model.RelatedPath == null || this.Hrefs.ContainsKey(model.RelatedPath))
            return;

        this.Hrefs.Add(model.RelatedPath, model);

        CollectingUrlEventHandler handler = this.CollectingUrl;
        if (handler == null)
            return;

        try
        {
            handler(this, new CollectingUrlEventArgs(model));
        }
        catch (Exception)
        {
            // Deliberate best effort: a failing subscriber must not abort
            // the collection of the remaining URLs.
        }
    }

    // Collects the page itself, its resources, the pages it links to and
    // their resources; recurses into linked pages while level < depth.
    private void GetHrefs(int level, string url, string html)
    {
        UrlModel currUrl = UrlParser.Parse(url);

        if (backgroundWorker1.CancellationPending)
            return;

        Collect(currUrl);

        // Links (href) and resources (src / url(...)) of the current page.
        List<UrlModel> pageLinks = WebPageService.GetLocalHrefs(url, html);
        List<UrlModel> pageResources = WebPageService.GetLocalSrcs(url, html);

        if (pageResources != null)
        {
            foreach (UrlModel resource in pageResources)
            {
                if (backgroundWorker1.CancellationPending)
                    return;

                Collect(resource);
            }
        }

        if (pageLinks == null)
            return;

        foreach (UrlModel link in pageLinks)
        {
            if (backgroundWorker1.CancellationPending)
                return;

            Collect(link);

            // Nothing to fetch when the link could not be resolved.
            if (string.IsNullOrEmpty(link.AbsoluteUri))
                continue;

            string msg;
            string linkedHtml = HttpTool.HttpGet(link.AbsoluteUri, link.AbsoluteUri, this.Encoding, out msg);

            // Resources referenced by the linked page or style sheet.
            List<UrlModel> linkedResources = WebPageService.GetLocalSrcs(link.AbsoluteUri, linkedHtml);
            if (linkedResources != null)
            {
                foreach (UrlModel resource in linkedResources)
                {
                    if (backgroundWorker1.CancellationPending)
                        return;

                    Collect(resource);

                    // Pause 20 ms between items.
                    Thread.Sleep(20);
                }
            }

            // Pause 20 ms between links.
            Thread.Sleep(20);

            // At the configured depth the remaining links of this page are
            // still collected; only the recursion into them is skipped.
            if (level >= depth)
                continue;

            GetHrefs(level + 1, link.AbsoluteUri, linkedHtml);
        }
    }
}

複製程式碼

6.一些事件、委託類:

複製程式碼

/// <summary>Handles WebCloneWorker.UrlChanged, raised for each collected URL just before it is saved.</summary>
public delegate void UrlChangedEventHandler(object sender, UrlChangedEventArgs e);
    /// <summary>Handles WebCloneWorker.FileSavedSuccess, raised after a file has been downloaded.</summary>
    public delegate void FileSavedSuccessEventHandler(object sender, FileSavedSuccessEventArgs e);
    /// <summary>Handles WebCloneWorker.FileSavedFail, raised when saving a file threw an exception.</summary>
    public delegate void FileSavedFailEventHandler(object sender, FileSavedFailEventArgs e);
    /// <summary>Handles WebCloneWorker.DownloadCompleted, raised when the background job finishes without being cancelled.</summary>
    public delegate void DownloadCompletedEventHandler(object sender, DownloadCompletedEventArgs e);
    /// <summary>Handles WebCloneWorker.CollectingUrl, raised each time a URL is added to the collection.</summary>
    public delegate void CollectingUrlEventHandler(object sender, CollectingUrlEventArgs e);
    /// <summary>Handles WebCloneWorker.CollectedUrl, raised once URL collection has finished.</summary>
    public delegate void CollectedUrlEventHandler(object sender, CollectedUrlEventArgs e);
    /// <summary>Handles WebCloneWorker.ProgressChanged, which forwards the background worker's progress reports.</summary>
    public delegate void ProgressChangedEventHandler(object sender, ProgressChangedEventArgs e);

複製程式碼

public class CollectedUrlEventArgs : EventArgs
public class CollectingUrlEventArgs : EventArgs
public class DownloadCompletedEventArgs : EventArgs
public class FileSavedFailEventArgs : EventArgs
public class FileSavedSuccessEventArgs : EventArgs
public class UrlChangedEventArgs : EventArgs

程式碼有點多,各位有需要的還是下載原始碼檢視並執行吧,由於趕時間,沒時間仔細測試程式的各個功能,難免有不足的地方。

百度網盤:連結:https://pan.baidu.com/s/1hja1rl9UEcl0dzTqVFt0dg 密碼:7s6r