-
Notifications
You must be signed in to change notification settings - Fork 561
/
Copy pathCrawlConfiguration.cs
250 lines (205 loc) · 10.7 KB
/
CrawlConfiguration.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
using System.Collections.Generic;
namespace Abot2.Poco
{
/// <summary>
/// Plain settings object for a crawl: crawl behavior, politeness rules and authorization.
/// Every setting is a public read/write property.
/// </summary>
public class CrawlConfiguration
{
    /// <summary>
    /// Creates a configuration populated with the defaults assigned below.
    /// Properties that are not assigned here keep their type's default value (0, false or null).
    /// </summary>
    public CrawlConfiguration()
    {
        // Identity presented to servers and to robots.txt.
        UserAgentString = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36";
        RobotsDotTextUserAgentString = "abot";

        // Overall crawl limits.
        MaxConcurrentThreads = 10;
        MaxPagesToCrawl = 1000;
        MaxCrawlDepth = 100;
        MaxRobotsDotTextCrawlDelayInSeconds = 5;

        // Content selection and extension data.
        DownloadableContentTypes = "text/html";
        ConfigurationExtensions = new Dictionary<string, string>();

        // Http request tuning.
        HttpServicePointConnectionLimit = 200;
        HttpRequestTimeoutInSeconds = 15;
        IsHttpRequestAutoRedirectsEnabled = true;
        HttpRequestMaxAutoRedirects = 7;

        // NOTE(review): server certificate validation is disabled by default; callers that
        // need the default validation must set this to true explicitly.
        IsSslCertificateValidationEnabled = false;
    }

    #region crawlBehavior

    /// <summary>
    /// Maximum number of concurrent threads used for http requests.
    /// </summary>
    public int MaxConcurrentThreads { get; set; }

    /// <summary>
    /// Upper bound on the number of pages to crawl.
    /// A value of zero disables this limit.
    /// </summary>
    public int MaxPagesToCrawl { get; set; }

    /// <summary>
    /// Upper bound on the number of pages to crawl for each domain.
    /// A value of zero disables this limit.
    /// </summary>
    public int MaxPagesToCrawlPerDomain { get; set; }

    /// <summary>
    /// Largest page size, in bytes, that will be downloaded or processed; bigger pages are skipped.
    /// A value of zero disables this limit.
    /// </summary>
    public int MaxPageSizeInBytes { get; set; }

    /// <summary>
    /// User agent string sent with http requests.
    /// </summary>
    public string UserAgentString { get; set; }

    /// <summary>
    /// Http protocol version used for http requests. Versions "1.1" and "1.0" are the ones currently supported.
    /// </summary>
    public HttpProtocolVersion HttpProtocolVersion { get; set; }

    /// <summary>
    /// Number of seconds after which the crawl times out and stops.
    /// A value of zero disables the timeout.
    /// </summary>
    public int CrawlTimeoutSeconds { get; set; }

    /// <summary>
    /// Additional key/value pairs that can be read anywhere in the crawl pipeline.
    /// </summary>
    public Dictionary<string, string> ConfigurationExtensions { get; set; }

    /// <summary>
    /// Whether a uri may be crawled more than once. Uncommon; should stay false for most scenarios.
    /// </summary>
    public bool IsUriRecrawlingEnabled { get; set; }

    /// <summary>
    /// Whether pages that are external to the root uri should be crawled.
    /// </summary>
    public bool IsExternalPageCrawlingEnabled { get; set; }

    /// <summary>
    /// Whether links found on pages external to the root uri should be crawled.
    /// NOTE: <see cref="IsExternalPageCrawlingEnabled"/> must be true for this setting to have any effect.
    /// </summary>
    public bool IsExternalPageLinksCrawlingEnabled { get; set; }

    /// <summary>
    /// Whether url named anchors or hashbangs count as part of the url.
    /// When false they are ignored; when true they are treated as part of the url.
    /// </summary>
    public bool IsRespectUrlNamedAnchorOrHashbangEnabled { get; set; }

    /// <summary>
    /// Comma separated list of content types whose page content should be downloaded.
    /// For each page, the content type is checked to see whether it contains any of the listed values.
    /// </summary>
    public string DownloadableContentTypes { get; set; }

    /// <summary>
    /// Maximum number of concurrent connections allowed by a System.Net.ServicePoint.
    /// The system default is 2, meaning only 2 concurrent http connections can be open to the same host.
    /// A value of zero leaves this setting without effect.
    /// </summary>
    public int HttpServicePointConnectionLimit { get; set; }

    /// <summary>
    /// Time-out, in seconds, for the System.Net.HttpWebRequest.GetResponse() and
    /// System.Net.HttpWebRequest.GetRequestStream() methods.
    /// A value of zero leaves this setting without effect.
    /// </summary>
    public int HttpRequestTimeoutInSeconds { get; set; }

    /// <summary>
    /// Maximum number of redirects that a request follows.
    /// A value of zero leaves this setting without effect.
    /// </summary>
    public int HttpRequestMaxAutoRedirects { get; set; }

    /// <summary>
    /// Whether a request should follow redirection.
    /// </summary>
    public bool IsHttpRequestAutoRedirectsEnabled { get; set; }

    /// <summary>
    /// Whether gzip and deflate are automatically accepted and decompressed.
    /// </summary>
    public bool IsHttpRequestAutomaticDecompressionEnabled { get; set; }

    /// <summary>
    /// Whether cookies should be set and resent with every request.
    /// </summary>
    public bool IsSendingCookiesEnabled { get; set; }

    /// <summary>
    /// Whether the server SSL certificate is validated. When true, the default validation is performed.
    /// When false, certificate validation is bypassed, which is useful for crawling sites with an
    /// invalid or expired SSL certificate.
    /// </summary>
    public bool IsSslCertificateValidationEnabled { get; set; }

    /// <summary>
    /// Minimum amount of memory, in megabytes, that must be available before a crawl starts.
    /// The closest multiple of 16 to the configured value is used. If that much memory is not available,
    /// a <see cref="System.InsufficientMemoryException"/> is thrown.
    /// A value of zero disables this check.
    /// </summary>
    public int MinAvailableMemoryRequiredInMb { get; set; }

    /// <summary>
    /// Maximum amount of memory, in megabytes, the process is allowed to use.
    /// When the limit is exceeded the crawler stops prematurely.
    /// A value of zero disables this limit.
    /// </summary>
    public int MaxMemoryUsageInMb { get; set; }

    /// <summary>
    /// Maximum time, in seconds, before refreshing the cached value that reports how much memory
    /// the process hosting the crawler instance is using.
    /// Has no effect when <see cref="MaxMemoryUsageInMb"/> is zero.
    /// </summary>
    public int MaxMemoryUsageCacheTimeInSeconds { get; set; }

    /// <summary>
    /// Maximum number of levels below the root page to crawl. With 0, the homepage is crawled but none of
    /// its links are. With 1, the homepage and its links are crawled but none of the links' links are.
    /// </summary>
    public int MaxCrawlDepth { get; set; }

    /// <summary>
    /// Maximum number of links to crawl per page.
    /// A value of zero disables this limit.
    /// </summary>
    public int MaxLinksPerPage { get; set; }

    /// <summary>
    /// Whether the crawler should parse a page's links even when a CrawlDecision
    /// (like CrawlDecisionMaker.ShouldCrawlPageLinks()) determines that those links will not be crawled.
    /// </summary>
    public bool IsForcedLinkParsingEnabled { get; set; }

    /// <summary>
    /// Maximum number of retries for a url when a web exception is encountered.
    /// With a value of 0 no retries are made.
    /// </summary>
    public int MaxRetryCount { get; set; }

    /// <summary>
    /// Minimum delay, in milliseconds, between a failed http request and the next retry.
    /// </summary>
    public int MinRetryDelayInMilliseconds { get; set; }

    #endregion

    #region politeness

    /// <summary>
    /// Whether the crawler should retrieve and respect the robots.txt file.
    /// </summary>
    public bool IsRespectRobotsDotTextEnabled { get; set; }

    /// <summary>
    /// Whether the crawler should ignore links on pages that carry a
    /// <c>&lt;meta name="robots" content="nofollow" /&gt;</c> tag.
    /// </summary>
    public bool IsRespectMetaRobotsNoFollowEnabled { get; set; }

    /// <summary>
    /// Whether the crawler should ignore links on pages served with an http X-Robots-Tag header of nofollow.
    /// </summary>
    public bool IsRespectHttpXRobotsTagHeaderNoFollowEnabled { get; set; }

    /// <summary>
    /// Whether the crawler should ignore links of the form
    /// <c>&lt;a href="whatever" rel="nofollow" /&gt;</c>.
    /// </summary>
    public bool IsRespectAnchorRelNoFollowEnabled { get; set; }

    /// <summary>
    /// When true, the robots.txt file is ignored if it disallows crawling the root uri.
    /// </summary>
    public bool IsIgnoreRobotsDotTextIfRootDisallowedEnabled { get; set; }

    /// <summary>
    /// User agent string used when looking up directives in the robots.txt file.
    /// Examples of other crawlers' user agent values are "googlebot", "slurp" etc.
    /// </summary>
    public string RobotsDotTextUserAgentString { get; set; }

    /// <summary>
    /// Number of milliseconds to wait between http requests to the same domain.
    /// </summary>
    public int MinCrawlDelayPerDomainMilliSeconds { get; set; }

    /// <summary>
    /// Maximum number of seconds to respect from the robots.txt "Crawl-delay: X" directive.
    /// <see cref="IsRespectRobotsDotTextEnabled"/> must be true for this value to be used.
    /// With zero, whatever crawl delay robots.txt requests is used, no matter how high it is.
    /// </summary>
    public int MaxRobotsDotTextCrawlDelayInSeconds { get; set; }

    #endregion

    #region Authorization

    /// <summary>
    /// Whether each request should be authorized via login.
    /// </summary>
    public bool IsAlwaysLogin { get; set; }

    /// <summary>
    /// User name used for authorization.
    /// </summary>
    public string LoginUser { get; set; }

    /// <summary>
    /// Password used for authorization.
    /// </summary>
    public string LoginPassword { get; set; }

    /// <summary>
    /// Whether to use default credentials.
    /// </summary>
    public bool UseDefaultCredentials { get; set; }

    #endregion
}
}