-
Notifications
You must be signed in to change notification settings - Fork 561
/
Copy pathCrawlDecisionMaker.cs
157 lines (124 loc) · 7.83 KB
/
CrawlDecisionMaker.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
using Abot2.Poco;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
namespace Abot2.Core
{
/// <summary>
/// Determines what pages should be crawled, whether the raw content should be downloaded and if the links on a page should be crawled
/// </summary>
public interface ICrawlDecisionMaker
{
    /// <summary>
    /// Decides whether the page should be crawled at all (scheme, depth, page-count limits, etc).
    /// </summary>
    CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext);
    /// <summary>
    /// Decides whether the links found on the page should be scheduled for crawling.
    /// </summary>
    CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext);
    /// <summary>
    /// Decides whether the page's content should be downloaded (status code, content type, size limits).
    /// </summary>
    CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext);
    /// <summary>
    /// Decides whether the page should be re-crawled after a failed request.
    /// </summary>
    CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext);
}
public class CrawlDecisionMaker : ICrawlDecisionMaker
{
    /// <summary>
    /// Decides whether <paramref name="pageToCrawl"/> should be crawled, checking (in order):
    /// nulls, redirect-chain length, crawl depth, URI scheme, total page-count limit,
    /// per-domain page-count limit, and internal/external status.
    /// </summary>
    /// <returns>A <see cref="CrawlDecision"/> with <c>Allow = false</c> and a human-readable
    /// <c>Reason</c> on the first failed check, otherwise <c>Allow = true</c>.</returns>
    public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
    {
        if (pageToCrawl == null)
            return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

        if (crawlContext == null)
            return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

        if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
            return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

        if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
            return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

        // Only http/https URIs are crawlable (rejects mailto:, ftp:, javascript:, file:, etc).
        if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
            return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

        //TODO Do we want to ignore redirect chains (ie.. do not treat them as separate page crawls)?
        // The +1 accounts for this page itself; pages already scheduled count toward the limit.
        // Retries are exempt so retrying a failed page never pushes the crawl over the limit.
        if (!pageToCrawl.IsRetry &&
            crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
            crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
        {
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
        }

        // Per-domain limit; keyed by Uri.Authority (host + optional port).
        if (!pageToCrawl.IsRetry &&
            crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
            crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out var pagesCrawledInThisDomain) &&
            pagesCrawledInThisDomain > 0)
        {
            if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
        }

        if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
            return new CrawlDecision { Allow = false, Reason = "Link is external" };

        return new CrawlDecision { Allow = true };
    }

    /// <summary>
    /// Decides whether the links found on <paramref name="crawledPage"/> should be crawled,
    /// based on page content, internal/external status and crawl depth.
    /// </summary>
    public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
    {
        if (crawledPage == null)
            return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

        if (crawlContext == null)
            return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

        // BUGFIX: Content may be null when the download was skipped or failed; the original
        // dereferenced it unconditionally and threw NullReferenceException. Treat null the
        // same as empty text.
        if (string.IsNullOrWhiteSpace(crawledPage.Content?.Text))
            return new CrawlDecision { Allow = false, Reason = "Page has no content" };

        if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
            return new CrawlDecision { Allow = false, Reason = "Link is external" };

        // Note >= (not >) so links discovered AT max depth are not scheduled one level deeper.
        if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
            return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

        return new CrawlDecision { Allow = true };
    }

    /// <summary>
    /// Decides whether the body of <paramref name="crawledPage"/> should be downloaded,
    /// checking status code, content type against the configured whitelist, and max page size.
    /// </summary>
    public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
    {
        if (crawledPage == null)
            return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

        if (crawlContext == null)
            return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

        if (crawledPage.HttpResponseMessage == null)
            return new CrawlDecision { Allow = false, Reason = "Null HttpResponseMessage" };

        if (crawledPage.HttpResponseMessage.StatusCode != HttpStatusCode.OK)
            return new CrawlDecision { Allow = false, Reason = "HttpStatusCode is not 200" };

        // BUGFIX: the Content-Type header is legitimately absent on some responses, in which
        // case HttpContentHeaders.ContentType is null; the original dereferenced it and threw
        // NullReferenceException. Decline the download instead of crashing.
        var contentTypeHeader = crawledPage.HttpResponseMessage.Content?.Headers?.ContentType;
        if (contentTypeHeader == null)
            return new CrawlDecision { Allow = false, Reason = "Null content type" };

        var pageContentType = contentTypeHeader.ToString().ToLower().Trim();

        // DownloadableContentTypes is a comma-separated whitelist, e.g. "text/html,text/plain".
        var cleanDownloadableContentTypes = crawlContext.CrawlConfiguration.DownloadableContentTypes
            .Split(',')
            .Select(t => t.Trim())
            .Where(t => !string.IsNullOrEmpty(t))
            .ToList();

        // Substring match so "text/html" matches "text/html; charset=utf-8".
        var isDownloadable = cleanDownloadableContentTypes
            .Any(allowed => pageContentType.Contains(allowed.ToLower()));

        if (!isDownloadable)
            return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + string.Join(",", cleanDownloadableContentTypes) };

        // ContentLength is nullable; the comparison is false (page allowed) when the header is absent.
        if (crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 && crawledPage.HttpResponseMessage.Content.Headers.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
            return new CrawlDecision { Allow = false, Reason = string.Format("Page size of [{0}] bytes is above the max allowable of [{1}] bytes", crawledPage.HttpResponseMessage.Content.Headers.ContentLength, crawlContext.CrawlConfiguration.MaxPageSizeInBytes) };

        return new CrawlDecision { Allow = true };
    }

    /// <summary>
    /// Decides whether <paramref name="crawledPage"/> should be retried: only pages whose
    /// request threw an exception are retried, and only while under MaxRetryCount.
    /// </summary>
    public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
    {
        if (crawledPage == null)
            return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

        if (crawlContext == null)
            return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

        // Only retry transport-level failures; a completed response (even 4xx/5xx) is final.
        if (crawledPage.HttpRequestException == null)
            return new CrawlDecision { Allow = false, Reason = "WebException did not occur" };

        if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
            return new CrawlDecision { Allow = false, Reason = "MaxRetryCount is less than 1" };

        if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
            return new CrawlDecision { Allow = false, Reason = "MaxRetryCount has been reached" };

        return new CrawlDecision { Allow = true };
    }
}
}