|
1 | 1 | # alink
|
2 |
| -- [中文](#中文) |
| 2 | +- [ZH](#简介) |
3 | 3 |
|
4 | 4 | A Golang package that reads elements such as `<a>` (href), `<video>` and `<title>` from an HTML page.
|
5 | 5 |
|
6 | 6 |
|
| 7 | +## 简介 |
7 | 8 |
|
8 |
| -## 中文 |
9 |
| -一个Golang package 用来读取HTML页面中的 <title> ,<video>,<a> 等元素 |
10 |
| -输入一个 http.Get 返回的 response 使用 html.Parse 解析后返回一个字符串数组指针 |
| 9 | +一个简单的 Golang package,主要用来读取 HTML 页面中的 ``` <title>, <video>, <a>``` 等元素。 |
| 10 | +通过 ```alink.NewRespBody``` 方法处理可以读取 ```http.Get``` 返回的```response.Body```内容。 |
| 11 | +注意:```response.Body``` 是一个流,只能被读取一次。如果需要多次读取,请先通过 ```body, err := ioutil.ReadAll(b.Body)``` 读出全部内容,再用 ``` readerHref := bytes.NewReader(body)``` 的方式为每次读取新建一个 reader。 |
| 12 | +内部使用html.Parse 解析后返回一个字符串数组指针。 |
11 | 13 |
|
| 14 | +### 例子 Example |
12 | 15 |
|
| 16 | +- 一个读取 google/baidu 主页的例子:获取页面的 title 和全部 a 链接并打印出来。 |
13 | 17 |
|
14 |
| -### 例子 |
15 |
| -```go |
| 18 | +- Use an HTTP client to GET the Google/Baidu index page, then collect and print the page title and all href links. |
16 | 19 |
|
| 20 | +```go |
17 | 21 | package main
|
| 22 | + |
18 | 23 | import (
|
19 |
| - "https://github.com/gitgitcode/alink" |
20 |
| - "golang.org/x/net/html" |
21 |
| - "fmt" |
| 24 | + "github.com/gitgitcode/alink" |
| 25 | + "fmt" |
| 26 | + "log" |
| 27 | + "math/rand" |
| 28 | + "net/http" |
| 29 | + "time" |
22 | 30 | )
|
| 31 | +var userAgentList = []string{"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)", |
| 32 | + "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)", |
| 33 | + "Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)", |
| 34 | + "Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,", |
| 35 | + "Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11", |
| 36 | + "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)", |
| 37 | + "Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", |
| 38 | + "Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", |
| 39 | + "Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", |
| 40 | + "Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", |
| 41 | + "Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", |
| 42 | + "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)", |
| 43 | + "Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", |
| 44 | + "MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"} |
| 45 | +func GetRandomUserAgent() string{ |
| 46 | + r := rand.New(rand.NewSource(time.Now().UnixNano())) |
| 47 | + return userAgentList[r.Intn(len(userAgentList))] |
| 48 | +} |
| 49 | +var accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" |
| 50 | + |
| 51 | +func main() { |
| 52 | + |
| 53 | + str:="https://google.co.jp" |
| 54 | + str1:="https://www.baidu.com" |
| 55 | + |
| 56 | + //fmt.Print(alink.IsValidUrl(str1)) |
| 57 | + client:= http.Client{Timeout: 2 * time.Second} |
| 58 | + req,err := http.NewRequest("GET",str,nil) |
| 59 | + req1,err1 := http.NewRequest("GET",str1,nil) |
23 | 60 |
|
24 |
| -func main(){ |
25 |
| - resp,_ := http.Get("http://www.testtest.com") |
26 |
| - newResp ,err := alink.NewRespBody(resp.Body) |
27 |
| - if err !=nil{ |
28 |
| - log.Print(err.Error()) |
29 |
| - } |
30 |
| - links,_ := alink.Alink(newResp) |
31 |
| - fmt.Println(links) |
| 61 | + if err != nil{ |
| 62 | + log.Printf("google is err:%s",err.Error()) |
| 63 | + } |
| 64 | + |
| 65 | + if err1 != nil{ |
| 66 | + log.Printf("baidu is err:%s",err1.Error()) |
| 67 | + } |
| 68 | + |
| 69 | + ReqAdd(req) |
| 70 | + ReqAdd(req1) |
| 71 | + b,err := client.Do(req) |
| 72 | + defer client.CloseIdleConnections() |
| 73 | + |
| 74 | + if err != nil{ |
| 75 | + log.Printf("request google err %s",err.Error()) |
| 76 | + b1,err1 := client.Do(req1) |
| 77 | + if err1 !=nil{ |
| 78 | + log.Printf("request baidu err %s",err.Error()) |
| 79 | + return |
| 80 | + } |
| 81 | + b = b1 |
| 82 | + } |
| 83 | + |
| 84 | + body, err := ioutil.ReadAll(b.Body) |
| 85 | + if err !=nil{ |
| 86 | + panic(err) |
| 87 | + } |
| 88 | + // The body can only be read once, so create a separate reader for each pass. |
| 89 | + readerHref := bytes.NewReader(body) |
| 90 | + //创建两个新 reader |
| 91 | + readerTitle := bytes.NewReader(body) |
32 | 92 |
|
| 93 | + t,f := alink.Title(readerTitle) |
| 94 | + |
| 95 | + if f !=nil { |
| 96 | + log.Print(f) |
| 97 | + } |
| 98 | + fmt.Printf("title:%s \n",t) |
| 99 | + |
| 100 | + a,bl := alink.Alink(readerHref) |
| 101 | + |
| 102 | + |
| 103 | + if bl { |
| 104 | + for i,v := range *a{ |
| 105 | + fmt.Printf("index:%d=href:%s\n",i,v) |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + |
| 110 | + |
| 111 | + //title:百度一下,你就知道 |
| 112 | + //index:0=href:/ |
| 113 | + // index:1=href:javascript:; |
| 114 | + // index:2=href:https://passport.baidu.com/v2 |
| 115 | + //or |
| 116 | +//title:Google |
| 117 | + //index:0=href:/ |
| 118 | + // index:1=href:javascript:; |
| 119 | + // index:2=href:https://www.google.com/ |
| 120 | + |
33 | 121 | }
|
| 122 | + |
| 123 | +func ReqAdd(req *http.Request) { |
| 124 | + req.Header.Set("Cookie","sug=3; a=1; ORIGIN=0; bdime=21110") |
| 125 | + req.Header.Add("User-Agent",GetRandomUserAgent() ) |
| 126 | + req.Header.Add("Accept",accept) |
| 127 | + req.Header.Add("Upgrade-Insecure-Requests","1") |
| 128 | +} |
| 129 | + |
34 | 130 | ```
|
0 commit comments