
Commit c4281cb

add crawler
1 parent 48771e2 commit c4281cb

21 files changed: +365 -0 lines changed

crawl_360/crawl_360/__init__.py

Whitespace-only changes.

crawl_360/crawl_360/__init__.pyc

140 Bytes
Binary file not shown.

crawl_360/crawl_360/items.py

+26
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Crawl360Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class ButianItem(scrapy.Item):
    author = scrapy.Field()         # author
    company_name = scrapy.Field()   # company name
    vul_name = scrapy.Field()       # vulnerability name, e.g. SQL injection
    vul_level = scrapy.Field()      # severity, e.g. high
    vul_type = scrapy.Field()       # vulnerability type, e.g. generic
    vul_money = scrapy.Field()      # reward amount
    vul_find_time = scrapy.Field()  # time the vulnerability was found
    link_url = scrapy.Field()       # crawled URL
    create_time = scrapy.Field()    # record creation time

crawl_360/crawl_360/items.pyc

828 Bytes
Binary file not shown.

crawl_360/crawl_360/middlewares.py

+103
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class Crawl360SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Crawl360DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
crawl_360/crawl_360/models/__init__.py

+5

@@ -0,0 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8
# author: Lock
# time: 2018/4/28 11:35
147 Bytes
Binary file not shown.

crawl_360/crawl_360/models/db.py

+31
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# encoding: utf-8
# author: Lock
# time: 2018/4/28 11:36

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# Base class for declarative models:
Base = declarative_base()

CONFIG = {
    'db_host': '127.0.0.1',
    'db_user': 'root',
    'db_pass': '',
    'db_port': 3306,
    'db_name': 'crawl'
}

# Initialize the database connection:
engine = create_engine('mysql+mysqlconnector://%s:%s@%s:%s/%s' % (
    CONFIG.get('db_user'),
    CONFIG.get('db_pass'),
    CONFIG.get('db_host'),
    CONFIG.get('db_port'),
    CONFIG.get('db_name'),
))

# Create the DBSession class:
DBSession = sessionmaker(bind=engine)
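
Editor's note, not part of the commit: db.py only builds the engine and the session factory, and it assumes a local MySQL server with a passwordless root account, an existing database named crawl, and the mysql-connector-python driver installed. A minimal sketch of how the pieces are meant to be used; the butian table can come either from reademe/sql.sql or, roughly equivalently, from the ORM metadata:

# minimal sketch, assuming the crawl database already exists
from crawl_360.models.db import engine, DBSession
from crawl_360.models.models import Base

# emits CREATE TABLE for butian if it is missing; the generated DDL differs
# slightly from reademe/sql.sql, and strict MySQL modes may reject the
# '0000-00-00 00:00:00' server default
Base.metadata.create_all(engine)

session = DBSession()  # Crawl360Pipeline obtains its session the same way
session.close()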

crawl_360/crawl_360/models/db.pyc

725 Bytes
Binary file not shown.

crawl_360/crawl_360/models/models.py

+20
@@ -0,0 +1,20 @@
# coding: utf-8
from sqlalchemy import Column, DateTime, Integer, Numeric, String, text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
metadata = Base.metadata


class Butian(Base):
    __tablename__ = 'butian'

    id = Column(Integer, primary_key=True)
    author = Column(String(100), nullable=False, server_default=text("''"))
    company_name = Column(String(100), nullable=False, server_default=text("''"))
    vul_level = Column(String(100), nullable=False, server_default=text("''"))
    vul_name = Column(String(100), nullable=False, server_default=text("''"))
    vul_money = Column(Numeric(10, 2), nullable=False)
    vul_find_time = Column(DateTime, nullable=False, server_default=text("'0000-00-00 00:00:00'"))
    link_url = Column(String(255), nullable=False, server_default=text("''"))
    create_time = Column(DateTime, nullable=False)

crawl_360/crawl_360/models/models.pyc

1.21 KB
Binary file not shown.

crawl_360/crawl_360/pipelines.py

+29
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from crawl_360.items import ButianItem
from crawl_360.models.db import DBSession
from crawl_360.models.models import Butian


class Crawl360Pipeline(object):
    def __init__(self):
        self.db_session = DBSession()

    def process_item(self, item, spider):
        if isinstance(item, ButianItem):
            data_item = Butian(**item)
            # write the record to the database
            self.db_session.add(data_item)
            try:
                self.db_session.commit()
            except Exception as e:
                print(e)
                self.db_session.rollback()
        return item

    def close_spider(self, spider):
        self.db_session.close()
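
Editor's note, not part of the commit: Butian(**item) works because a Scrapy item behaves like a dict containing only the fields the spider actually set, and every field the spider sets has a same-named column on the Butian model (vul_type, which has no matching column, is never populated). A hedged sketch with made-up values:

from crawl_360.items import ButianItem
from crawl_360.models.models import Butian

item = ButianItem()
item['author'] = 'someone'                     # illustrative values only
item['company_name'] = 'example corp'
item['vul_name'] = 'SQL injection'
item['vul_level'] = 'high'
item['vul_money'] = '100'
item['vul_find_time'] = '2018-04-28 00:00:00'  # MySQL parses the string into DATETIME
item['link_url'] = 'http://butian.360.cn/Loo/'
item['create_time'] = '2018-04-28 12:00:00'

row = Butian(**item)  # the item expands like a dict of keyword arguments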

crawl_360/crawl_360/pipelines.pyc

1.32 KB
Binary file not shown.

crawl_360/crawl_360/reademe/sql.sql

+12
@@ -0,0 +1,12 @@
CREATE TABLE butian (
    id INT(11) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT 'auto-increment id',
    author VARCHAR(100) NOT NULL DEFAULT '' COMMENT 'author',
    company_name VARCHAR(100) NOT NULL DEFAULT '' COMMENT 'company name',
    vul_level VARCHAR(100) NOT NULL DEFAULT '' COMMENT 'vulnerability severity',
    vul_name VARCHAR(100) NOT NULL DEFAULT '' COMMENT 'vulnerability name',
    vul_money DECIMAL(10,2) NOT NULL DEFAULT 0 COMMENT 'vulnerability reward',
    vul_find_time DATETIME NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT 'time the vulnerability was found',
    link_url VARCHAR(255) NOT NULL DEFAULT '' COMMENT 'page URL',
    create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
    PRIMARY KEY (id)
) ENGINE=INNODB DEFAULT CHARSET utf8;

crawl_360/crawl_360/settings.py

+90
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for crawl_360 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'crawl_360'

SPIDER_MODULES = ['crawl_360.spiders']
NEWSPIDER_MODULE = 'crawl_360.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'crawl_360 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'crawl_360.middlewares.Crawl360SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'crawl_360.middlewares.Crawl360DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'crawl_360.pipelines.Crawl360Pipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

crawl_360/crawl_360/settings.pyc

372 Bytes
Binary file not shown.
crawl_360/crawl_360/spiders/__init__.py

+4

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
148 Bytes
Binary file not shown.

crawl_360/crawl_360/spiders/butian.py

+34
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
import scrapy

from crawl_360.items import ButianItem
import time


class ButianSpider(scrapy.Spider):
    name = 'butian'
    allowed_domains = ['butian.360.cn']
    start_urls = ['http://butian.360.cn/Loo/']

    def parse(self, response):
        self.logger.info('start parsing target page ...')
        for sel in response.xpath('//ul[@class="loopListBottom"]/li'):
            item = ButianItem()
            item['author'] = sel.xpath('dl/dd/span[1]/text()').extract_first(default='').strip()
            item['company_name'] = sel.xpath('dl/dd/a/text()').extract_first(default='').strip()
            item['vul_name'] = sel.xpath('dl/dd/span[3]/text()').extract_first(default='').replace(u'的一个', '').strip()
            item['vul_level'] = sel.xpath('dl/dd[2]/strong[@class="loopHigh"]/text()').extract_first(default='').strip()
            item['vul_money'] = sel.xpath('dl/p[@class="loopJiangjin"]/text()').extract_first(default=0)
            item['vul_find_time'] = sel.xpath('dl/dd[2]/em/text()').extract_first(default='').strip()
            item['link_url'] = response.url.strip()
            item['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S")
            self.logger.info('found item data: %s' % (item,))
            yield item

        next_page = response.xpath(u'//div[@class="btPage page"]/a[contains(text(),"下一页")]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            self.logger.info('next page url is: %s' % (next_page,))
            yield scrapy.Request(url=next_page, callback=self.parse)
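
Editor's note, not part of the commit: the spider is normally started with "scrapy crawl butian" from the directory containing scrapy.cfg; scraped items then flow through Crawl360Pipeline into MySQL, so the crawl database and butian table have to exist first. A minimal sketch of the equivalent start-up from Python, assuming it is run from the project root:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from crawl_360.spiders.butian import ButianSpider

process = CrawlerProcess(get_project_settings())  # loads crawl_360.settings
process.crawl(ButianSpider)
process.start()                                   # blocks until the crawl finishes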
1.98 KB
Binary file not shown.

crawl_360/scrapy.cfg

+11
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = crawl_360.settings

[deploy]
#url = http://localhost:6800/
project = crawl_360
