概述

gocolly 是用 Go 语言开发的爬虫框架。本文通过 gocolly 携带浏览器中已登录账号的 cookie,模拟网站的登录状态。
目标已经确立,开始行动。

下载

$ go get -u github.com/gocolly/colly

获取cookie

登录网站:https://www.bilibili.com/
获取cookie:

  • 使用 Google Chrome 浏览器访问 bilibili 并登录
  • 按 F12 打开开发者工具
  • 切换到 Network 选项卡
  • 点击 Doc 过滤器(如列表为空,刷新页面)
  • 选中页面请求,在请求头(Request Headers)中查看 cookie 信息

colly爬虫代码流程

初始化一个Collector收集器

c := colly.NewCollector(
		colly.AllowedDomains("www.bilibili.com"),
		colly.AllowURLRevisit(),
		colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"))

爬取网站的规则设置:

// Register a rate-limit rule for bilibili hosts.
err := c.Limit(&colly.LimitRule{
	// DomainGlob is matched against the request's host name only (no scheme
	// and no path), so the pattern must be able to match "www.bilibili.com".
	// A pattern such as "bilibili.com/*" never matches a host and the rule
	// would silently never apply.
	DomainGlob: "*bilibili.com",
	// Fixed delay between requests to the matching hosts.
	Delay: 1 * time.Second,
	// Additional random delay added on top of Delay.
	RandomDelay: 1 * time.Second,
	// Maximum number of concurrent requests to the matching hosts.
	Parallelism: 5,
})

开始爬取

// Start crawling at url; the error returned by Visit is stored in err and
// must be checked by the caller.
err = c.Visit(url)
// NOTE(review): per colly's documentation Wait blocks until the collector's
// outstanding jobs have finished, which matters only when the collector runs
// in async mode — confirm against the collector configuration.
c.Wait()
代码示例
package main

import (
	"fmt"
	"github.com/gocolly/colly"
	"net/http"
	"os"
	"strings"
	"time"
)
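/*
Overview of the colly callbacks:

Called before a request is sent
	- OnRequest
Called after a response is received
	- OnResponse
Called for every HTML element matching the given CSS selector
	- OnHTML
Called for every XML element matching the given XPath query
	- OnXML
Called when an error occurs during the request
	- OnError
Called after all OnHTML/OnXML callbacks for a response have finished
	- OnScraped
Deregisters an OnHTML callback; the argument is the selector string
	- OnHTMLDetach
Deregisters an OnXML callback; the argument is the XPath query string
	- OnXMLDetach
*/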
// setCookieRaw parses a raw Cookie header value, such as the string copied
// from a browser's developer tools ("k1=v1; k2=v2"), into a slice of cookies.
//
// Pairs are separated by ";" with optional surrounding whitespace. Each pair
// is split on the first "=" only, so a value may itself contain "=". A pair
// without "=" yields a cookie with that name and an empty value. Empty
// segments (from an empty input or a trailing ";") and pairs with an empty
// name are skipped; an input with no usable pairs returns a nil slice.
func setCookieRaw(cookieRaw string) []*http.Cookie {
	var cookies []*http.Cookie
	for _, item := range strings.Split(cookieRaw, ";") {
		item = strings.TrimSpace(item)
		if item == "" {
			continue
		}

		// Split once on the first "=" so that "=" inside the value survives.
		name, value := item, ""
		if i := strings.Index(item, "="); i >= 0 {
			name, value = item[:i], item[i+1:]
		}

		name = strings.TrimSpace(name)
		if name == "" {
			// A cookie without a name cannot be sent; drop it.
			continue
		}

		cookies = append(cookies, &http.Cookie{Name: name, Value: value})
	}
	return cookies
}

// main crawls the bilibili home page with a colly collector that carries a
// browser session cookie. For every link on the page it prints the text of
// the <span> elements inside it, and for the link whose span text is "动态"
// and whose target is "_blank" it also prints the href.
func main() {
	const (
		startURL = "https://www.bilibili.com/"
		// Raw Cookie header value; replace the placeholder with the string
		// copied from a logged-in browser session.
		cookieRaw = "xxx"
	)

	c := colly.NewCollector(
		colly.AllowedDomains("www.bilibili.com"),
		colly.AllowURLRevisit(),
		colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"),
	)

	err := c.Limit(&colly.LimitRule{
		// DomainGlob is matched against the request's host name only, so the
		// pattern must be able to match "www.bilibili.com"; a pattern with a
		// path component such as "bilibili.com/*" never matches.
		DomainGlob: "*bilibili.com",
		// Fixed delay between requests to the matching hosts.
		Delay: 1 * time.Second,
		// Additional random delay added on top of Delay.
		RandomDelay: 1 * time.Second,
		// Maximum number of concurrent requests to the matching hosts.
		Parallelism: 5,
	})
	if err != nil {
		// Do not crawl without the rate limit in place.
		fmt.Fprintln(os.Stderr, "fad:", err)
		os.Exit(1)
	}

	// Install the session cookies once, before the first request, instead of
	// re-adding them to the jar on every request.
	if err := c.SetCookies(startURL, setCookieRaw(cookieRaw)); err != nil {
		fmt.Fprintln(os.Stderr, "fad:", err)
	}

	// Extra request headers. "Host" and "User-Agent" are intentionally not
	// listed: net/http derives Host from the request URL, and the User-Agent
	// is already configured through colly.UserAgent above.
	headers := map[string]string{
		"Accept":     "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
		"Connection": "keep-alive",
	}

	// Before each request: apply the extra headers and log the URL.
	c.OnRequest(func(r *colly.Request) {
		for key, value := range headers {
			// Set (not Add) so a header never ends up with duplicate values.
			r.Headers.Set(key, value)
		}
		fmt.Println("url: ", r.URL.String())
	})

	// For each <a href> element: print its span text, and the link itself
	// when it is the "动态" entry opening in a new tab.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		name := e.ChildText("span")
		if len(name) == 0 {
			return
		}
		fmt.Println("name: ", name)
		if e.Attr("target") == "_blank" && name == "动态" {
			fmt.Println("link: ", e.Attr("href"))
		}
	})

	if err := c.Visit(startURL); err != nil {
		// fmt.Errorf only builds an error value; write the message to stderr
		// so the failure is actually reported before exiting.
		fmt.Fprintf(os.Stderr, "fffffff %s\n", err.Error())
		os.Exit(1)
	}
	// NOTE(review): Wait only blocks when the collector runs in async mode;
	// kept so the program stays correct if colly.Async is enabled later.
	c.Wait()

	fmt.Println("程序结束")
}

--完--