Colly crawler in Go

A small crawler built on gocolly/colly: it crawls www.uidzhx.com, finds pages that expose a .txt download link, and saves each file to disk, with at most ten downloads running concurrently.

package main

import (
        "bufio"
        "fmt"
        "github.com/antchfx/htmlquery"
        "github.com/gocolly/colly"
        "github.com/gocolly/colly/extensions"
        "io/ioutil"
        "log"
        "net/http"
        "os"
        "regexp"
        "strings"
        "sync"
        "time"
)
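// wg waits for the download goroutines to finish; ch is a buffered channel
// used as a semaphore to cap how many downloads run at the same time.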
var wg sync.WaitGroup
var ch chan int

func main() {
        // Buffered channel used as a semaphore: at most 10 downloads run at once.
        ch = make(chan int, 10)
        // Do not visit pages under /du/.
        reNotAllow := `http://www.uidzhx.com/du/.*\.html`
        // URL revisits and async mode stay at their colly defaults (both disabled).
        c := colly.NewCollector(
                colly.AllowedDomains("www.uidzhx.com"),
                colly.IgnoreRobotsTxt(),
                colly.DisallowedURLFilters(regexp.MustCompile(reNotAllow)),
        )
        extensions.RandomUserAgent(c)
        extensions.Referer(c)

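        // Log every request just before it is sent.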
        c.OnRequest(func(r *colly.Request) {
                fmt.Println("Visiting", r.URL.String())
        })


        // Rate-limit requests: wait a random delay of up to one second before each visit.
        if err := c.Limit(&colly.LimitRule{
                DomainGlob:  "*",
                //Parallelism: 2,
                RandomDelay: 1 * time.Second,
        }); err != nil {
                log.Fatal(err)
        }
        
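        // Queue every link found on a page; AllowedDomains and the
        // disallowed-URL filter decide which ones are actually fetched.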
        c.OnHTML("a[href]", func(e *colly.HTMLElement) {
                link := e.Attr("href")
                c.Visit(e.Request.AbsoluteURL(link))
        })

        // After a response has been received, parse it and look for a .txt download link.
        c.OnResponse(func(r *colly.Response) {

                doc, err := htmlquery.Parse(strings.NewReader(string(r.Body)))
                if err != nil {
                        log.Fatal(err)
                }
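                // The <h1> at this XPath is treated as the book title; pages
                // without it are skipped.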
                title := htmlquery.FindOne(doc, `/html/body/div[4]/div[2]/div[1]/div/div[2]/div/h1`)
                if title != nil {
                        // Pull the .txt URL (hosted on dzs.uidzhx.com) straight out of the raw HTML.
                        reTxt := `http://dzs.uidzhx.com.*\.txt`
                        re := regexp.MustCompile(reTxt)
                        url := re.FindString(string(r.Body))
                        if url != "" {
                                txtTitle := strings.Replace(htmlquery.InnerText(title), " ", "", -1)
                                fmt.Println(txtTitle)
                                wg.Add(1)
                                // Acquire a semaphore slot before starting the download
                                // goroutine; saveTxt releases it when it finishes.
                                ch <- 1
                                go saveTxt(txtTitle, url)
                        }
                }
        })


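        // Seed the crawl with a single start page; everything else is reached
        // by following links from it.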
        if err := c.Visit("http://www.uidzhx.com/Shtml89401.html"); err != nil {
                log.Fatal(err)
        }

        // Block until every download goroutine has called wg.Done().
        wg.Wait()
}

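// saveTxt downloads the file at url, writes it to d:/crawl/<title>.txt,
// releases its semaphore slot, and signals the WaitGroup when it returns.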
func saveTxt(title string, url string) {
        defer wg.Done()
        // Release the semaphore slot no matter how this function returns.
        defer func() { <-ch }()

        str := download(url)
        fmt.Printf("save txt %s - %s\n", title, url)
        filePath := "d:/crawl/" + title + ".txt"
        file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
        if err != nil {
                fmt.Printf("open file err=%v\n", err)
                return
        }
        // Close the file handle when done.
        defer file.Close()
        // Use a buffered *Writer for the write, and flush it before returning.
        writer := bufio.NewWriter(file)
        writer.WriteString(str)
        writer.Flush()
}

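// download fetches url with a fixed desktop User-Agent and returns the
// response body as a string.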
func download(url string) string {
        client := &http.Client{}
        req, err := http.NewRequest("GET", url, nil)
        if err != nil {
                panic("bad request: " + err.Error())
        }

        req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)")
        resp, err := client.Do(req)
        if err != nil {
                fmt.Print("http get err ", err)
                panic("http get err")
        }
        defer resp.Body.Close()

        body, err := ioutil.ReadAll(resp.Body)
        if err != nil {
                fmt.Print("read error ", err)
                panic("read error")
        }
        return string(body)
}