
Commit da8b2d1

update: add EnqueueURL(), new test cases
1 parent 510eea1 commit da8b2d1

2 files changed (crawler.go, crawler_test.go): +61 -23 lines

crawler.go

Lines changed: 36 additions & 20 deletions
@@ -24,20 +24,25 @@ type Crawler struct {
 
 	// MaxConcurrentRequests specifies the maximum number of concurrent
 	// requests that will be performed.
+	// Default is 16.
 	MaxConcurrentRequests int
 
 	// MaxConcurrentRequestsPerHost specifies the maximum number of
 	// concurrent requests that will be performed to any single domain.
+	// Default is 1.
 	MaxConcurrentRequestsPerSite int
 
-	// RequestTimeout specifies a time to wait before the request times out.
+	// RequestTimeout specifies a time to wait before the HTTP Request times out.
+	// Default is 30s.
 	RequestTimeout time.Duration
 
 	// DownloadDelay specifies delay time to wait before access same website.
+	// Default is 0.25s.
 	DownloadDelay time.Duration
 
 	// MaxConcurrentItems specifies the maximum number of concurrent items
 	// to process parallel in the pipeline.
+	// Default is 32.
 	MaxConcurrentItems int
 
 	// UserAgent specifies the user-agent for the remote server.
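The new comments document a default for every tuning field. A minimal configuration sketch follows; the import path, and the assumption that zero-value fields fall back to these defaults via the crawler's internal accessors, are mine; only the field names and default values come from this diff.

```go
package main

import (
	"time"

	"example.com/crawler" // hypothetical import path for this package
)

func main() {
	c := crawler.NewCrawler()

	// Override a few knobs; fields left at their zero value are assumed to
	// fall back to the documented defaults (16 requests, 1 per site, 30s
	// timeout, 0.25s delay, 32 items).
	c.MaxConcurrentRequests = 8
	c.MaxConcurrentRequestsPerSite = 2
	c.RequestTimeout = 10 * time.Second
	c.DownloadDelay = 500 * time.Millisecond
	c.MaxConcurrentItems = 16

	c.StartURLs([]string{"https://example.com/"})
}
```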
@@ -79,21 +84,32 @@ type muxEntry struct {
 // StartURLs starts crawling for the given URL list.
 func (c *Crawler) StartURLs(URLs []string) {
 	for _, URL := range URLs {
-		req, _ := http.NewRequest("GET", URL, nil)
-		c.Request(req)
+		c.EnqueueURL(URL)
 	}
 }
 
-// Request puts an HTTP request into the working queue to crawling.
-func (c *Crawler) Request(req *http.Request) error {
-	c.once.Do(c.init)
+// Crawl puts an HTTP request into the working queue to crawling.
+func (c *Crawler) Crawl(req *http.Request) error {
 	if req == nil {
 		return errors.New("req is nil")
 	}
 	return c.enqueue(req, 5*time.Second)
 }
 
+// EnqueueURL puts given URL into the backup URLs queue.
+func (c *Crawler) EnqueueURL(URL string) error {
+	if URL == "" {
+		return errors.New("URL is nil")
+	}
+	req, err := http.NewRequest("GET", URL, nil)
+	if err != nil {
+		return err
+	}
+	return c.Crawl(req)
+}
+
 func (c *Crawler) enqueue(req *http.Request, timeout time.Duration) error {
+	c.once.Do(c.init)
 	select {
 	case c.readCh <- req:
 	case <-time.After(timeout):
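StartURLs now routes through the new EnqueueURL, which builds the GET request and returns any construction error instead of discarding it, and the old Request method is renamed Crawl; lazy initialisation (c.once.Do(c.init)) moves into enqueue so both entry points share it. A usage sketch, with a hypothetical import path and URLs; only Crawl, EnqueueURL, and their signatures come from the diff.

```go
package main

import (
	"log"
	"net/http"
	"strings"

	"example.com/crawler" // hypothetical import path
)

func main() {
	c := crawler.NewCrawler()

	// EnqueueURL builds a GET request and forwards it to Crawl,
	// returning (rather than silently dropping) any construction error.
	if err := c.EnqueueURL("https://example.com/about"); err != nil {
		log.Println("enqueue:", err)
	}

	// Crawl accepts any hand-built *http.Request, e.g. a POST with headers.
	req, err := http.NewRequest("POST", "https://example.com/search", strings.NewReader("q=go"))
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	if err := c.Crawl(req); err != nil {
		log.Println("crawl:", err)
	}
}
```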
@@ -127,14 +143,14 @@ func (c *Crawler) Handler(res *http.Response) (h Handler, pattern string) {
 }
 
 // UseMiddleware adds a Middleware to the crawler.
-func (c *Crawler) UseMiddleware(m Middleware) *Crawler {
-	c.mids = append(c.mids, m)
+func (c *Crawler) UseMiddleware(m ...Middleware) *Crawler {
+	c.mids = append(c.mids, m...)
 	return c
 }
 
 // UsePipeline adds a Pipeline to the crawler.
-func (c *Crawler) UsePipeline(p Pipeline) *Crawler {
-	c.pipes = append(c.pipes, p)
+func (c *Crawler) UsePipeline(p ...Pipeline) *Crawler {
+	c.pipes = append(c.pipes, p...)
 	return c
 }
 
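UseMiddleware and UsePipeline become variadic, so several values can be registered in one chained call. A sketch with placeholder values; it assumes Middleware and Pipeline are interface types, which this diff does not show.

```go
package main

import "example.com/crawler" // hypothetical import path

func main() {
	// Placeholder values: the concrete Middleware and Pipeline
	// implementations are not part of this diff, so nil interface
	// values stand in for them here.
	var logMW, retryMW crawler.Middleware
	var dedupe, store crawler.Pipeline

	c := crawler.NewCrawler()

	// Previously each call registered exactly one value; both methods
	// now accept any number in a single chained call.
	c.UseMiddleware(logMW, retryMW).UsePipeline(dedupe, store)
}
```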

@@ -415,9 +431,10 @@ func (c *Crawler) getSpider(url *url.URL) *spider {
 	s, ok := c.spider[key]
 	if !ok {
 		s = &spider{
-			c:     c,
-			reqch: make(chan requestAndChan),
-			key:   key,
+			c:           c,
+			reqch:       make(chan requestAndChan),
+			key:         key,
+			idleTimeout: 120 * time.Second,
 		}
 		c.spider[key] = s
 		go s.crawlLoop()
@@ -437,9 +454,10 @@ type responseAndError struct {
 
 // spider is http spider for the single site.
 type spider struct {
-	c     *Crawler
-	reqch chan requestAndChan
-	key   string
+	c           *Crawler
+	reqch       chan requestAndChan
+	key         string
+	idleTimeout time.Duration
 }
 
 func (s *spider) queueScanWorker(workCh chan chan requestAndChan, respCh chan int, closeCh chan struct{}) {
@@ -462,11 +480,9 @@ func (s *spider) queueScanWorker(workCh chan chan requestAndChan, respCh chan in
 }
 
 func (s *spider) crawlLoop() {
-	const idleTimeout = 120 * time.Second
-
 	respCh := make(chan int)
 	closeCh := make(chan struct{})
-	idleTimer := time.NewTimer(idleTimeout)
+	idleTimer := time.NewTimer(s.idleTimeout)
 	workCh := make(chan chan requestAndChan, s.c.maxConcurrentRequestsPerSite())
 
 	for i := 0; i < s.c.maxConcurrentRequestsPerSite(); i++ {
@@ -485,7 +501,7 @@ func (s *spider) crawlLoop() {
 			c := <-workCh
 			c <- rc
 		case <-respCh:
-			idleTimer.Reset(idleTimeout)
+			idleTimer.Reset(s.idleTimeout)
 		case <-idleTimer.C:
 			goto exit
 		case <-s.c.Exit:
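The idle timeout moves from a function-local constant into a per-spider field, still set to 120s in getSpider, so tests can shorten it (see TestSpiderIdleTimeout below); crawlLoop resets the timer whenever a response completes and shuts the spider down when it fires. The same pattern in isolation, with illustrative names only, not code from this repository:

```go
package main

import (
	"fmt"
	"time"
)

// workerLoop processes items until no work arrives for idleTimeout,
// then shuts itself down, mirroring crawlLoop's idle-timer handling.
func workerLoop(work <-chan string, idleTimeout time.Duration) {
	idleTimer := time.NewTimer(idleTimeout)
	defer idleTimer.Stop()

	for {
		select {
		case w, ok := <-work:
			if !ok {
				return
			}
			fmt.Println("processed", w)
			idleTimer.Reset(idleTimeout) // activity: push the deadline back
		case <-idleTimer.C:
			fmt.Println("idle for", idleTimeout, "- exiting")
			return
		}
	}
}

func main() {
	work := make(chan string)
	done := make(chan struct{})
	go func() {
		defer close(done)
		workerLoop(work, 200*time.Millisecond)
	}()

	work <- "https://example.com/a"
	work <- "https://example.com/b"
	<-done // the worker exits on its own once no work arrives for 200ms
}
```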

crawler_test.go

Lines changed: 25 additions & 3 deletions
@@ -6,6 +6,7 @@ import (
 	"net/http/httptest"
 	"net/url"
 	"testing"
+	"time"
 )
 
 func TestCrawlerBasic(t *testing.T) {
@@ -60,9 +61,6 @@ func TestCrawlerSpiderMux(t *testing.T) {
 	}
 
 	var tc = NewCrawler()
-	tc.Handle("*", HandlerFunc(func(c chan<- Item, res *http.Response) {
-		c <- 0
-	}))
 	for _, e := range spiderMuxTests {
 		tc.Handle(e.pattern, HandlerFunc(func(c chan<- Item, res *http.Response) {
 			c <- res.StatusCode
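The table-driven loop already registers a handler per pattern, so the extra catch-all registration was redundant and is dropped. For reference, registration from outside the package looks roughly like this; the import path and the host pattern are illustrative, while Handle, HandlerFunc, Item, and the "*" wildcard all appear in the repo.

```go
package main

import (
	"net/http"

	"example.com/crawler" // hypothetical import path
)

func main() {
	c := crawler.NewCrawler()

	// Host-specific handler; the pattern string here is illustrative.
	c.Handle("example.com", crawler.HandlerFunc(func(items chan<- crawler.Item, res *http.Response) {
		items <- res.StatusCode // the tests emit status codes the same way
	}))

	// Catch-all handler, like the "*" registration the test no longer needs.
	c.Handle("*", crawler.HandlerFunc(func(items chan<- crawler.Item, res *http.Response) {
		items <- res.StatusCode
	}))

	c.StartURLs([]string{"https://example.com/"})
}
```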
@@ -88,3 +86,27 @@ func TestCrawlerSpiderMux(t *testing.T) {
 		}
 	}
 }
+
+func TestSpiderIdleTimeout(t *testing.T) {
+	timeout := 10 * time.Millisecond
+	spider := &spider{
+		key:         "test",
+		c:           &Crawler{},
+		idleTimeout: timeout,
+	}
+	done := make(chan struct{})
+	var (
+		start time.Time
+		end   time.Time
+	)
+	go func() {
+		defer close(done)
+		start = time.Now()
+		spider.crawlLoop()
+		end = time.Now()
+	}()
+	<-done
+	if d := end.Sub(start); d < timeout {
+		t.Errorf("spider's idle timeout expected >= %s; but ran for %s", timeout, d)
+	}
+}
