diff --git a/crawler/engine/engine.go b/crawler/engine/engine.go
new file mode 100644
index 0000000000000000000000000000000000000000..f469048e7c95f166bfe8951e89958c7358892c19
--- /dev/null
+++ b/crawler/engine/engine.go
@@ -0,0 +1,33 @@
+package engine
+
+import (
+ "learngo/crawler/fetcher"
+ "log"
+)
+
+func Run(seeds ...Request) {
+ var requests []Request // requet 队列
+ // 请求加入队列
+ for _, r := range seeds {
+ requests = append(requests, r)
+ }
+ for len(requests) > 0 {
+ r := requests[0] // 获取第一个请求request
+ requests = requests[1:] // 截取
+ log.Printf(" parserUrl :%s", r.Url)
+ body, error := fetcher.Fetch(r.Url)
+
+ if error != nil {
+ log.Printf("Fetcher :error fetcher url %s : %v", r.Url, error)
+ continue
+
+ }
+ parserResult := r.ParserFunc(body) // body 传给解析器,返回解析结果
+ requests = append(requests,
+ parserResult.Requests...)
+ //打印解析后的item 城市名称
+ for _, item := range parserResult.Items {
+ log.Printf(" got item %v", item)
+ }
+ }
+}
diff --git a/crawler/engine/types.go b/crawler/engine/types.go
new file mode 100644
index 0000000000000000000000000000000000000000..12a9ddc6b83c8b3e421eaf96bf3f8652f15963b7
--- /dev/null
+++ b/crawler/engine/types.go
@@ -0,0 +1,15 @@
+package engine
+
+type Request struct {
+ Url string
+ ParserFunc func([]byte) ParserResult
+}
+
+type ParserResult struct {
+ Requests []Request
+ Items []interface{}
+}
+
+func NilParser([]byte) ParserResult {
+ return ParserResult{}
+}
diff --git a/crawler/fetcher/fetcher.go b/crawler/fetcher/fetcher.go
new file mode 100644
index 0000000000000000000000000000000000000000..1e4c6989b21cdc01090f33fd19959e5610b6f360
--- /dev/null
+++ b/crawler/fetcher/fetcher.go
@@ -0,0 +1,49 @@
+package fetcher
+
+import (
+ "bufio"
+ "fmt"
+ "golang.org/x/net/html/charset"
+ "golang.org/x/text/encoding"
+ "golang.org/x/text/encoding/unicode"
+ "golang.org/x/text/transform"
+ "io"
+ "io/ioutil"
+ "log"
+ "net/http"
+)
+
+// Fetch downloads url over HTTP and returns the body decoded to UTF-8.
+func Fetch(url string) ([]byte, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf(" wrong status code: %d", resp.StatusCode)
+	}
+	// Wrap once so bytes peeked while sniffing the encoding are not lost.
+	bodyReader := bufio.NewReader(resp.Body)
+	e := determineEncoding(bodyReader)
+	// Convert the body from the detected encoding to UTF-8.
+	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
+	return ioutil.ReadAll(utf8Reader)
+}
+
+// determineEncoding sniffs the page encoding from up to 1024 peeked
+// bytes (not consumed when r is buffered); falls back to UTF-8.
+func determineEncoding(r io.Reader) encoding.Encoding {
+	// Peek on the caller's reader when possible, so the sniffed
+	// bytes remain readable by the caller afterwards.
+	br, ok := r.(*bufio.Reader)
+	if !ok {
+		br = bufio.NewReader(r)
+	}
+	data, err := br.Peek(1024)
+	if err != nil && len(data) == 0 {
+		log.Printf("Fetcher error : %v", err)
+		return unicode.UTF8 // nothing to sniff: default to UTF-8
+	}
+	e, _, _ := charset.DetermineEncoding(data, "")
+	return e
+}
diff --git a/crawler/main.go b/crawler/main.go
index cdda725290d83135d4a9b81c32b97bc24ad28894..194f63e564fcb1b898e0c0f785da39c55bb32bbe 100644
--- a/crawler/main.go
+++ b/crawler/main.go
@@ -8,11 +8,21 @@ import (
"golang.org/x/text/transform"
"io"
"io/ioutil"
+ "learngo/crawler/engine"
+ "learngo/crawler/zhenai/paser"
"net/http"
+ "regexp"
)
func main() {
+ engine.Run(
+ engine.Request{
+ Url: "https://www.zhenai.com/zhenghun",
+ ParserFunc: paser.ParseCityList,
+ })
+}
+func mains() {
resp, err := http.Get("https://www.zhenai.com/zhenghun")
if err != nil {
@@ -35,7 +45,9 @@ func main() {
panic(err)
}
- fmt.Printf("%s\n", all)
+ //fmt.Printf("%s\n", all)
+
+ printCityList(all)
}
@@ -52,3 +64,13 @@ func determineEncoding(r io.Reader) encoding.Encoding {
fmt.Printf(" name %s certain %s", name, certain)
return e
}
+
+// printCityList prints every city (name and URL) found in contents.
+func printCityList(contents []byte) {
+	// Group 1 is the city page URL, group 2 the city name.
+	re := regexp.MustCompile(`<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)
+	matches := re.FindAllSubmatch(contents, -1)
+	fmt.Printf("查找%d 城市", len(matches)) // %d: the count is an int
+	for _, m := range matches {
+		fmt.Printf("City: %s ,URL: %s\n", m[2], m[1])
+	}
+}
diff --git a/crawler/zhenai/paser/citylist.go b/crawler/zhenai/paser/citylist.go
new file mode 100644
index 0000000000000000000000000000000000000000..622f3155f78c21f2262ddf9ca191eef6ef8fc6bc
--- /dev/null
+++ b/crawler/zhenai/paser/citylist.go
@@ -0,0 +1,29 @@
+package paser
+
+import (
+	"learngo/crawler/engine"
+	"regexp"
+)
+
+// 城市列表解析器 — city-list page parser.
+// cityListRe captures (1) the city page URL and (2) the city name.
+const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`
+
+// ParseCityList returns one item (the city name) and one follow-up
+// request per city link found in contents.
+func ParseCityList(contents []byte) engine.ParserResult {
+	re := regexp.MustCompile(cityListRe)
+	// m[0] is the full match, m[1] the URL, m[2] the city name.
+	matches := re.FindAllSubmatch(contents, -1)
+	result := engine.ParserResult{}
+	for _, m := range matches {
+		// The city name becomes an item,
+		result.Items = append(result.Items, string(m[2]))
+		// and its URL a follow-up request (a leaf for now: NilParser).
+		result.Requests = append(result.Requests, engine.Request{
+			Url:        string(m[1]),
+			ParserFunc: engine.NilParser,
+		})
+	}
+	return result
+}