@@ -24,20 +24,25 @@ type Crawler struct {
 
 	// MaxConcurrentRequests specifies the maximum number of concurrent
 	// requests that will be performed.
+	// Default is 16.
 	MaxConcurrentRequests int
 
 	// MaxConcurrentRequestsPerSite specifies the maximum number of
 	// concurrent requests that will be performed to any single domain.
+	// Default is 1.
 	MaxConcurrentRequestsPerSite int
 
-	// RequestTimeout specifies a time to wait before the request times out.
+	// RequestTimeout specifies a time to wait before the HTTP request times out.
+	// Default is 30s.
 	RequestTimeout time.Duration
 
 	// DownloadDelay specifies the delay to wait between requests to the same website.
+	// Default is 0.25s.
 	DownloadDelay time.Duration
 
 	// MaxConcurrentItems specifies the maximum number of concurrent items
 	// to process in parallel in the pipeline.
+	// Default is 32.
 	MaxConcurrentItems int
 
 	// UserAgent specifies the user-agent for the remote server.
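
The defaults noted above presumably apply when a field is left at its zero value (the lazily-run init suggests as much); callers can still override any of them explicitly. A minimal sketch, with arbitrary values:

    c := &Crawler{
        MaxConcurrentRequests:        8,                // below the default of 16
        MaxConcurrentRequestsPerSite: 2,                // above the per-site default of 1
        RequestTimeout:               10 * time.Second, // default is 30s
        DownloadDelay:                time.Second,      // default is 0.25s
        MaxConcurrentItems:           64,               // default is 32
    }
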
@@ -79,21 +84,32 @@ type muxEntry struct {
 // StartURLs starts crawling for the given URL list.
 func (c *Crawler) StartURLs(URLs []string) {
 	for _, URL := range URLs {
-		req, _ := http.NewRequest("GET", URL, nil)
-		c.Request(req)
+		c.EnqueueURL(URL)
 	}
 }
 
-// Request puts an HTTP request into the working queue to crawling.
-func (c *Crawler) Request(req *http.Request) error {
-	c.once.Do(c.init)
+// Crawl puts an HTTP request into the working queue for crawling.
+func (c *Crawler) Crawl(req *http.Request) error {
 	if req == nil {
 		return errors.New("req is nil")
 	}
 	return c.enqueue(req, 5*time.Second)
 }
 
+// EnqueueURL puts a GET request for the given URL into the working queue.
+func (c *Crawler) EnqueueURL(URL string) error {
+	if URL == "" {
+		return errors.New("URL is empty")
+	}
+	req, err := http.NewRequest("GET", URL, nil)
+	if err != nil {
+		return err
+	}
+	return c.Crawl(req)
+}
+
 func (c *Crawler) enqueue(req *http.Request, timeout time.Duration) error {
+	c.once.Do(c.init)
 	select {
 	case c.readCh <- req:
 	case <-time.After(timeout):
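
Since c.once.Do(c.init) now runs inside enqueue, both entry points stay equally safe on a zero-value Crawler. A usage sketch for the two (the URLs are placeholders):

    c := &Crawler{}

    // Simple case: EnqueueURL validates the URL and builds the GET request.
    if err := c.EnqueueURL("https://example.com/"); err != nil {
        log.Fatal(err)
    }

    // Full control: build the request yourself and hand it to Crawl.
    req, err := http.NewRequest("GET", "https://example.com/feed", nil)
    if err != nil {
        log.Fatal(err)
    }
    req.Header.Set("Accept", "application/xml")
    if err := c.Crawl(req); err != nil {
        log.Fatal(err)
    }
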
@@ -127,14 +143,14 @@ func (c *Crawler) Handler(res *http.Response) (h Handler, pattern string) {
 }
 
 // UseMiddleware adds one or more Middlewares to the crawler.
-func (c *Crawler) UseMiddleware(m Middleware) *Crawler {
-	c.mids = append(c.mids, m)
+func (c *Crawler) UseMiddleware(m ...Middleware) *Crawler {
+	c.mids = append(c.mids, m...)
 	return c
 }
 
 // UsePipeline adds one or more Pipelines to the crawler.
-func (c *Crawler) UsePipeline(p Pipeline) *Crawler {
-	c.pipes = append(c.pipes, p)
+func (c *Crawler) UsePipeline(p ...Pipeline) *Crawler {
+	c.pipes = append(c.pipes, p...)
 	return c
 }
 
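
The variadic signatures allow several values per call while keeping every existing single-argument call site compiling unchanged. A sketch, where the middleware and pipeline values are hypothetical:

    c := &Crawler{}
    c.UseMiddleware(loggingMiddleware, retryMiddleware).
        UsePipeline(jsonPipeline, dbPipeline)
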
@@ -415,9 +431,10 @@ func (c *Crawler) getSpider(url *url.URL) *spider {
 	s, ok := c.spider[key]
 	if !ok {
 		s = &spider{
-			c:     c,
-			reqch: make(chan requestAndChan),
-			key:   key,
+			c:           c,
+			reqch:       make(chan requestAndChan),
+			key:         key,
+			idleTimeout: 120 * time.Second,
 		}
 		c.spider[key] = s
 		go s.crawlLoop()
@@ -437,9 +454,10 @@ type responseAndError struct {
 
 // spider is an HTTP spider for a single site.
 type spider struct {
-	c     *Crawler
-	reqch chan requestAndChan
-	key   string
+	c           *Crawler
+	reqch       chan requestAndChan
+	key         string
+	idleTimeout time.Duration
 }
 
 func (s *spider) queueScanWorker(workCh chan chan requestAndChan, respCh chan int, closeCh chan struct{}) {
@@ -462,11 +480,9 @@ func (s *spider) queueScanWorker(workCh chan chan requestAndChan, respCh chan in
 }
 
 func (s *spider) crawlLoop() {
-	const idleTimeout = 120 * time.Second
-
 	respCh := make(chan int)
 	closeCh := make(chan struct{})
-	idleTimer := time.NewTimer(idleTimeout)
+	idleTimer := time.NewTimer(s.idleTimeout)
 	workCh := make(chan chan requestAndChan, s.c.maxConcurrentRequestsPerSite())
 
 	for i := 0; i < s.c.maxConcurrentRequestsPerSite(); i++ {
@@ -485,7 +501,7 @@ func (s *spider) crawlLoop() {
 		c := <-workCh
 		c <- rc
 	case <-respCh:
-		idleTimer.Reset(idleTimeout)
+		idleTimer.Reset(s.idleTimeout)
 	case <-idleTimer.C:
 		goto exit
 	case <-s.c.Exit:
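
Promoting idleTimeout from a const to a spider field makes the quiet-period shutdown configurable per spider (and testable with short timeouts) without touching crawlLoop's logic. A self-contained sketch of the same reset-on-activity pattern, with illustrative names:

    package main

    import (
        "fmt"
        "time"
    )

    // worker exits once no work arrives for idleTimeout, mirroring crawlLoop:
    // every piece of activity resets the idle timer, and the loop stops when
    // the timer finally fires.
    func worker(work <-chan string, idleTimeout time.Duration, done chan<- struct{}) {
        defer close(done)
        idle := time.NewTimer(idleTimeout)
        defer idle.Stop()
        for {
            select {
            case w := <-work:
                fmt.Println("handled", w)
                idle.Reset(idleTimeout) // any activity defers the shutdown
            case <-idle.C:
                fmt.Println("idle; exiting")
                return
            }
        }
    }

    func main() {
        work := make(chan string)
        done := make(chan struct{})
        go worker(work, 100*time.Millisecond, done)
        work <- "job-1"
        <-done
    }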