From 30eb3273eeb610535940e841150e544c3e8b17e2 Mon Sep 17 00:00:00 2001
From: feilong
Date: Sat, 17 Apr 2021 11:38:20 +0800
Subject: [PATCH] Improve code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/tag_source/stackoverflow.py |  2 +-
 src/tag_source/vscode.py        | 17 +----------------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/src/tag_source/stackoverflow.py b/src/tag_source/stackoverflow.py
index 0c9ad50..98ac2f2 100644
--- a/src/tag_source/stackoverflow.py
+++ b/src/tag_source/stackoverflow.py
@@ -9,7 +9,7 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.settings import Settings
 
 class StackOverflowTagSpider(scrapy.Spider):
-    name = "vscode_tags"
+    name = "stackoverflow_tags"
     allowed_domains = ["visualstudio.com"]
     start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
     custom_settings = {
diff --git a/src/tag_source/vscode.py b/src/tag_source/vscode.py
index eafba58..4f9bee3 100644
--- a/src/tag_source/vscode.py
+++ b/src/tag_source/vscode.py
@@ -7,29 +7,14 @@ import scrapy
 from scrapy.crawler import CrawlerProcess
 from scrapy.settings import Settings
 
-class CategoryItem(scrapy.Item):
-    name = scrapy.Field()
-    addr = scrapy.Field()
-
-class TagItem(scrapy.Item):
-    name = scrapy.Field()
-
 class VSCodeTagSpider(scrapy.Spider):
     name = "vscode_tags"
     allowed_domains = ["visualstudio.com"]
-    # start_urls = ['https://marketplace.visualstudio.com/search?target=VSCode&category=All%20categories&sortBy=Installs']
-    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
+    start_urls = ['https://marketplace.visualstudio.com/search?target=VSCode&category=All%20categories&sortBy=Installs']
 
     def parse(self, response):
         print('todo')
 
-class Categoryline(object):
-    def process_item(self, item, spider):
-        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
-        req = urllib.request.Request(url=item['addr'],headers=headers)
-        res = urllib.request.urlopen(req)
-
-
 def fetch():
     settings = Settings()
     process = CrawlerProcess()
--
GitLab
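
Note: the patch shows only the first lines of fetch(); the sketch below is one plausible way the renamed spiders could be driven from that stub. The crawl()/start() calls, the module import paths (tag_source.stackoverflow, tag_source.vscode), and the __main__ guard are assumptions, not part of the patch.

    # Hypothetical runner built around the fetch() stub visible in the diff.
    from scrapy.crawler import CrawlerProcess

    from tag_source.stackoverflow import StackOverflowTagSpider  # assumed module path
    from tag_source.vscode import VSCodeTagSpider                # assumed module path

    def fetch():
        # CrawlerProcess manages the Twisted reactor and can run several spiders
        # in a single process; default project settings are used here.
        process = CrawlerProcess()
        process.crawl(StackOverflowTagSpider)  # tag synonyms from stackoverflow.com
        process.crawl(VSCodeTagSpider)         # VS Code Marketplace search page
        process.start()                        # blocks until both crawls finish

    if __name__ == "__main__":
        fetch()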