Golang | 用gocolly登录B站
概述
gocolly 是用 Go 语言开发的爬虫框架。本文通过 gocolly 携带浏览器中已登录的 cookie,来模拟网站的登录。
目标已经确立,开始行动。
下载
$ go get -u github.com/gocolly/colly
获取cookie
登录网站:https://www.bilibili.com/
获取cookie:
- 用 Google 浏览器访问 bilibili
- 按 F12
- 查看 Network 选项
- 点击 Doc
- 查看 cookie 信息
colly爬虫代码流程
初始化一个 Collector(收集器)
// Desktop Chrome User-Agent sent with every request made by the collector.
const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"

// Build the collector with its three options.
c := colly.NewCollector(
	// Only URLs on this host may be visited.
	colly.AllowedDomains("www.bilibili.com"),
	// Permit requesting the same URL more than once.
	colly.AllowURLRevisit(),
	colly.UserAgent(userAgent),
)
爬取网站的规则设置:
// Register a rate-limit rule for requests to bilibili hosts.
err := c.Limit(&colly.LimitRule{
	// Filter domains affected by this rule. The glob is matched against the
	// request host only (no scheme or path), so it must not contain a path
	// component; "*bilibili.com" covers "www.bilibili.com".
	DomainGlob: "*bilibili.com",
	// Set a delay between requests to these domains.
	Delay: 1 * time.Second,
	// Add an additional random delay.
	RandomDelay: 1 * time.Second,
	// Number of requests allowed to run in parallel for matching domains.
	Parallelism: 5,
})
开始爬取
// Start the crawl by requesting url; the returned error is kept in err.
err = c.Visit(url)
// Wait for the collector's outstanding jobs to finish before moving on.
c.Wait()
代码示例
package main
import (
"fmt"
"github.com/gocolly/colly"
"net/http"
"os"
"strings"
"time"
)
/*
请求执行之前调用
- OnRequest
响应返回之后调用
- OnResponse
监听执行 selector
- OnHTML
监听执行 XPath 查询
- OnXML
错误回调
- OnError
完成抓取后执行,完成所有工作后执行
- OnScraped
取消监听,参数为 selector 字符串
- OnHTMLDetach
取消监听,参数为 XPath 查询字符串
- OnXMLDetach
*/
// setCookieRaw parses a raw Cookie header value such as "k1=v1; k2=v2" into a
// slice of cookies, one per name/value pair.
//
// Pairs are separated by ';' and surrounding whitespace is ignored, so both
// "a=1; b=2" and "a=1;b=2" are accepted. Only the first '=' in a pair splits
// the name from the value, which lets values contain '=' themselves. Empty
// segments and segments with an empty name are skipped; a segment without '='
// produces a cookie with an empty value. The result is nil when cookieRaw
// holds no cookie at all.
func setCookieRaw(cookieRaw string) []*http.Cookie {
	var cookies []*http.Cookie
	for _, item := range strings.Split(cookieRaw, ";") {
		item = strings.TrimSpace(item)
		if item == "" {
			// Empty input, trailing ';' or doubled separators.
			continue
		}

		// Split once so that '=' characters inside the value are preserved.
		pair := strings.SplitN(item, "=", 2)
		name := strings.TrimSpace(pair[0])
		if name == "" {
			continue
		}
		value := ""
		if len(pair) == 2 {
			value = pair[1]
		}

		cookies = append(cookies, &http.Cookie{Name: name, Value: value})
	}
	return cookies
}
// main crawls the bilibili home page with a browser cookie attached. For every
// link that contains non-empty <span> text it prints that text, and for the
// link named "动态" that opens in a new tab it also prints the href.
func main() {
	c := colly.NewCollector(
		colly.AllowedDomains("www.bilibili.com"),
		colly.AllowURLRevisit(),
		colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"))

	err := c.Limit(&colly.LimitRule{
		// Filter domains affected by this rule. The glob is matched against
		// the request host only, so it must not contain a path component;
		// "*bilibili.com" covers "www.bilibili.com".
		DomainGlob: "*bilibili.com",
		// Set a delay between requests to these domains.
		Delay: 1 * time.Second,
		// Add an additional random delay.
		RandomDelay: 1 * time.Second,
		// Number of requests allowed to run in parallel for matching domains.
		Parallelism: 5,
	})
	if err != nil {
		fmt.Println("fad:", err)
	}

	// Extra request headers. Host and User-Agent are intentionally absent:
	// the Host is derived from the request URL, and the User-Agent is already
	// configured on the collector above, so a second value would only conflict.
	header := map[string]string{
		"Accept":     "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
		"Connection": "keep-alive",
	}

	url := "https://www.bilibili.com/"

	// Register the login cookies for url once, before any request is made,
	// instead of re-parsing and re-registering them on every request.
	// Replace "xxx" with the Cookie value copied from the browser.
	cookie := "xxx"
	if err := c.SetCookies(url, setCookieRaw(cookie)); err != nil {
		fmt.Println("fad:", err)
	}

	// Before each request: apply the extra headers and log the target URL.
	c.OnRequest(func(r *colly.Request) {
		for key, value := range header {
			// Set (not Add) so a header never ends up with duplicate values.
			r.Headers.Set(key, value)
		}
		fmt.Println("url: ", r.URL.String())
	})

	// For every anchor with an href: report its <span> text, and the link
	// itself when it is the "动态" entry that opens in a new tab.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		name := e.ChildText("span")
		if len(name) == 0 {
			return
		}
		fmt.Println("name: ", name)
		if e.Attr("target") == "_blank" && name == "动态" {
			fmt.Println("link: ", e.Attr("href"))
		}
	})

	if err = c.Visit(url); err != nil {
		// Write the message to stderr so the failure is actually reported;
		// fmt.Errorf only builds an error value and prints nothing.
		fmt.Fprintf(os.Stderr, "fffffff %s\n", err.Error())
		os.Exit(-1)
	}
	c.Wait()
	fmt.Println("程序结束")
}
--完--
- 原文作者: 留白
- 原文链接: https://zfunnily.github.io/2020/12/gocolly/
- 更新时间:2024-04-16 01:01:05
- 本文声明:转载请标记原文作者及链接