8000 edit readme · gitgitcode/alink@91091b6 · GitHub
[go: up one dir, main page]

Skip to content

Commit 91091b6

Browse files
committed
edit readme
1 parent d0d8d11 commit 91091b6

File tree

3 files changed

+118
-19
lines changed

3 files changed

+118
-19
lines changed

README.md

Lines changed: 113 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,130 @@
11
# alink
2-
- [中文](#中文)
2+
- [ZH](#简介)
33

44
A Golang package to read href, video, title ... tags from an HTML page.
55

66

7+
## 简介
78

8-
## 中文
9-
一个Golang package 用来读取HTML页面中的 <title> ,<video>,<a> 等元素
10-
输入一个 http.Get 返回的 response 使用 html.Parse 解析后返回一个字符串数组指针
9+
一个简单的 Golang package，主要用来读取 HTML 页面中的 ```<title>, <video>, <a>``` 等元素。
10+
通过 ```alink.NewRespBody``` 方法处理可以读取 ```http.Get``` 返回的```response.Body```内容。
11+
注意：io.Reader 只能被读取一次。如果需要多次读取，请先通过 ```body, err := ioutil.ReadAll(b.Body)``` 读出全部内容，然后为每一次读取新建一个 reader，例如 ```readerHref := bytes.NewReader(body)```。
12+
内部使用html.Parse 解析后返回一个字符串数组指针。
1113

14+
### 例子 Example
1215

16+
- 一个读取 google/baidu 主页的例子：获取页面的 title 和全部 a 链接并打印出来。
1317

14-
### 例子
15-
```go
18+
- Use an HTTP client to GET the google/baidu index page, then collect and print the page title and all href links.
1619

20+
```go
1721
package main
22+
1823
import (
19-
"https://github.com/gitgitcode/alink"
20-
"golang.org/x/net/html"
21-
"fmt"
24+
"github.com/gitgitcode/alink"
25+
"fmt"
26+
"log"
27+
"math/rand"
28+
"net/http"
29+
"time"
2230
)
31+
var userAgentList = []string{"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
32+
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
33+
"Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
34+
"Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,",
35+
"Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
36+
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
37+
"Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
38+
"Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
39+
"Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
40+
"Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
41+
"Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
42+
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
43+
"Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
44+
"MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"}
45+
func GetRandomUserAgent() string{
46+
r := rand.New(rand.NewSource(time.Now().UnixNano()))
47+
return userAgentList[r.Intn(len(userAgentList))]
48+
}
49+
var accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
50+
51+
func main() {
52+
53+
str:="https://google.co.jp"
54+
str1:="https://www.baidu.com"
55+
56+
//fmt.Print(alink.IsValidUrl(str1))
57+
client:= http.Client{Timeout: 2 * time.Second}
58+
req,err := http.NewRequest("GET",str,nil)
59+
req1,err1 := http.NewRequest("GET",str1,nil)
2360

24-
func main(){
25-
resp,_ := http.Get("http://www.testtest.com")
26-
newResp ,err := alink.NewRespBody(resp.Body)
27-
if err !=nil{
28-
log.Print(err.Error())
29-
}
30-
links,_ := alink.Alink(newResp)
31-
fmt.Println(links)
61+
if err != nil{
62+
log.Printf("google is err:%s",err.Error())
63+
}
64+
65+
if err1 != nil{
66+
log.Printf("baidu is err:%s",err1.Error())
67+
}
68+
69+
ReqAdd(req)
70+
ReqAdd(req1)
71+
b,err := client.Do(req)
72+
defer client.CloseIdleConnections()
73+
74+
if err != nil{
75+
log.Printf("request google err %s",err.Error())
76+
b1,err1 := client.Do(req1)
77+
if err1 !=nil{
78+
log.Printf("request baidu err %s",err.Error())
79+
return
80+
}
81+
b = b1
82+
}
83+
84+
body, err := ioutil.ReadAll(b.Body)
85+
if err !=nil{
86+
panic(err)
87+
}
88+
// To read the body twice, create a separate reader for each pass.
89+
readerHref := bytes.NewReader(body)
90+
//创建两个新 reader
91+
readerTitle := bytes.NewReader(body)
3292

93+
t,f := alink.Title(readerTitle)
94+
95+
if f !=nil {
96+
log.Print(f)
97+
}
98+
fmt.Printf("title:%s \n",t)
99+
100+
a,bl := alink.Alink(readerHref)
101+
102+
103+
if bl {
104+
for i,v := range *a{
105+
fmt.Printf("index:%d=href:%s\n",i,v)
106+
}
107+
}
108+
109+
110+
111+
//title:百度一下,你就知道
112+
//index:0=href:/
113+
// index:1=href:javascript:;
114+
// index:2=href:https://passport.baidu.com/v2
115+
//or
116+
//title:Google
117+
//index:0=href:/
118+
// index:1=href:javascript:;
119+
// index:2=href:https://www.google.com/
120+
33121
}
122+
123+
func ReqAdd(req *http.Request) {
124+
req.Header.Set("Cookie","sug=3; a=1; ORIGIN=0; bdime=21110")
125+
req.Header.Add("User-Agent",GetRandomUserAgent() )
126+
req.Header.Add("Accept",accept)
127+
req.Header.Add("Upgrade-Insecure-Requests","1")
128+
}
129+
34130
```

alink.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ func isImgElement(n *html.Node) bool {
5656
//get page title
5757
func titleText(n *html.Node) (string, bool) {
5858
if isTitleElement(n) {
59+
//log.Print(n)
5960
return n.FirstChild.Data, true
6061
}
6162
for c := n.FirstChild; c != nil; c = c.NextSibling {
@@ -86,7 +87,7 @@ func videoSrc(node *html.Node) (string, bool) {
8687
}
8788

8889
//get video src
89-
func Video(httpBody *bytes.Reader) (s [] string, err error) {
90+
func VideoSrc(httpBody *bytes.Reader) (s [] string, err error) {
9091
var src []string
9192
node, err := html.Parse(httpBody)
9293
if err != nil {
@@ -107,7 +108,9 @@ func Title(httpBody *bytes.Reader) (t string, err error) {
107108
if err != nil {
108109
return title,err
109110
}
111+
110112
title, _ = titleText(node)
113+
111114
return title, nil
112115
}
113116

alink_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ func TestVideo(t *testing.T) {
132132
//log.Print(tests[0].wantS)
133133
for _, tt := range tests {
134134
t.Run(tt.name, func(t *testing.T) {
135-
gotS, err := Video(tt.args.httpBody)
135+
gotS, err := VideoSrc(tt.args.httpBody)
136136

137137
if (err != nil) != tt.wantErr {
138138
t.Errorf("Video() error = %v, wantErr %v", err, tt.wantErr)

0 commit comments

Comments
 (0)
0