Commit 6b381c5
committed: init
0 parents  commit 6b381c5

17 files changed, +654 -0 lines

.gitignore

+132 lines
@@ -0,0 +1,132 @@
env.sh
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

Dockerfile

+9 lines
@@ -0,0 +1,9 @@
FROM python:3.6
WORKDIR /app
COPY requirements.txt .
RUN pip3 install -r requirements.txt
COPY run.sh .
RUN sh run.sh
ADD . .
WORKDIR /app/spider
CMD python3 run.py
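
The image's entry point, python3 run.py in /app/spider, is not among the files shown in this excerpt. A minimal sketch of what such an entry point might look like, assuming the Django backend cloned by run.sh is importable via a core.settings module and the spider is registered under the name book (the path, settings module, and spider name are all assumptions):

# Hypothetical spider/run.py. Django must be configured before Scrapy starts,
# because items.py and pipelines.py import app.models at module level.
import os
import sys

import django
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'backend'))  # assumed location of the cloned backend
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')           # assumed settings module name
django.setup()

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('book')  # assumed spider name
    process.start()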

deployment.yml

+77 lines
@@ -0,0 +1,77 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  annotations:
    kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml
    kompose.version: 1.20.0 ()
  creationTimestamp: null
  labels:
    io.kompose.service: crawler-book-douban
  name: crawler-book-douban
  namespace: crawler
spec:
  replicas: 5
  revisionHistoryLimit: 1
  strategy: {}
  template:
    metadata:
      annotations:
        kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml
        kompose.version: 1.20.0 ()
      creationTimestamp: null
      labels:
        io.kompose.service: crawler-book-douban
    spec:
      containers:
        - args:
            - python3
            - run.py
          env:
            - name: PGSQL_DATABASE
              value: scrape_book
            - name: PGSQL_HOST
              valueFrom:
                secretKeyRef:
                  name: pgsql
                  key: host
            - name: PGSQL_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: pgsql
                  key: password
            - name: PGSQL_PORT
              valueFrom:
                secretKeyRef:
                  name: pgsql
                  key: port
            - name: PGSQL_USER
              valueFrom:
                secretKeyRef:
                  name: pgsql
                  key: user
            - name: REDIS_CONNECTION_STRING
              valueFrom:
                secretKeyRef:
                  name: redis
                  key: connection_string
            - name: PROXYPOOL_URL
              valueFrom:
                secretKeyRef:
                  name: proxypool
                  key: univeral
            - name: PROXYTUNNEL_URL
              valueFrom:
                secretKeyRef:
                  name: proxytunnel
                  key: value
          image: germey/crawler-book-douban:${TAG}
          name: crawler-book-douban
          resources:
            limits:
              memory: "200Mi"
              cpu: "150m"
            requests:
              memory: "200Mi"
              cpu: "150m"
      restartPolicy: Always
status: {}

docker-compose.yml

+16 lines
@@ -0,0 +1,16 @@
version: '3'
services:
  crawler-book-douban:
    container_name: 'crawler-book-douban'
    restart: always
    build: .
    image: 'germey/crawler-book-douban'
    command: 'python3 run.py'
    environment:
      REDIS_CONNECTION_STRING:
      PROXYPOOL_URL:
      PROXYTUNNEL_URL:
      PGSQL_HOST:
      PGSQL_PORT:
      PGSQL_USER:
      PGSQL_PASSWORD:
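
The empty values under environment: make docker-compose pass these variables through from the host. The spider's settings module is not shown here; a minimal sketch of how it could consume them with environs (pinned in requirements.txt), where the defaults are invented and only the variable names come from the compose and deployment files:

# Hypothetical excerpt from the Scrapy settings module.
from environs import Env

env = Env()
env.read_env()  # also picks up a local .env file during development

REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None)
PROXYPOOL_URL = env.str('PROXYPOOL_URL', None)
PROXYTUNNEL_URL = env.str('PROXYTUNNEL_URL', None)
PGSQL_HOST = env.str('PGSQL_HOST', 'localhost')
PGSQL_PORT = env.int('PGSQL_PORT', 5432)
PGSQL_USER = env.str('PGSQL_USER', 'postgres')
PGSQL_PASSWORD = env.str('PGSQL_PASSWORD', '')
PGSQL_DATABASE = env.str('PGSQL_DATABASE', 'scrape_book')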

requirements.txt

+13 lines
@@ -0,0 +1,13 @@
scrapy==1.6.0
Django==2.2.9
django-cors-headers==3.2.0
djangorestframework==3.11.0
requests==2.22.0
urllib3==1.25.7
uwsgi==2.0.18
psycopg2-binary==2.8.4
scrapy_djangoitem==1.1.1
dateparser==0.7.2
merry==0.2.2
environs==7.2.0
scrapy-redis==0.6.8

run.sh

+3 lines
@@ -0,0 +1,3 @@
git clone https://github.com/Germey/Scrape.git
cp -r Scrape/src/dynamic5/backend ./backend
rm -rf Scrape

spider/__init__.py

Whitespace-only changes.

spider/book/__init__.py

Whitespace-only changes.

spider/book/items.py

+18 lines
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy_djangoitem import DjangoItem
from app.models import Book, Comment
from scrapy.item import Field

class BookItem(DjangoItem):
    django_model = Book
    author_ids = Field()

class CommentItem(DjangoItem):
    django_model = Comment
    book_id = Field()
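
DjangoItem derives the item fields from the referenced Django model, so BookItem and CommentItem only declare the extra fields (author_ids, book_id) that are not columns on the models. app.models itself ships with the backend cloned by run.sh and is not part of this commit; the following is a hypothetical sketch of the shape implied by items.py and pipelines.py, and any field beyond id, title, website, and the book foreign key is purely illustrative:

# Hypothetical app/models.py, inferred from how the pipeline uses the items.
from django.db import models

class Book(models.Model):
    # PgSQLPipeline calls Book.objects.get_or_create(id=book_id)
    id = models.CharField(max_length=64, primary_key=True)
    title = models.CharField(max_length=255, blank=True, null=True)
    website = models.CharField(max_length=64, blank=True, null=True)

class Comment(models.Model):
    content = models.TextField(blank=True, null=True)
    # PgSQLPipeline assigns item.instance.book before saving
    book = models.ForeignKey(Book, related_name='comments', on_delete=models.CASCADE)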

spider/book/middlewares.py

+69 lines
@@ -0,0 +1,69 @@
import logging
import requests

class ProxytunnelMiddleware(object):
    def __init__(self, proxytunnel_url):
        self.logger = logging.getLogger(__name__)
        self.proxytunnel_url = proxytunnel_url

    def process_request(self, request, spider):
        """
        Route the request through the proxy tunnel when 1 <= retry_times <= 10.
        :param request:
        :param spider:
        :return:
        """
        if request.meta.get('retry_times') and 1 <= request.meta.get('retry_times') <= 10:
            self.logger.debug('Using proxytunnel')
            request.meta['proxy'] = self.proxytunnel_url

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxytunnel_url=settings.get('PROXYTUNNEL_URL')
        )

class ProxypoolMiddleware(object):
    """
    Proxy middleware that switches to a proxy from the proxy pool.
    """

    def __init__(self, proxypool_url):
        self.logger = logging.getLogger(__name__)
        self.proxypool_url = proxypool_url

    def get_random_proxy(self):
        """
        Get a random proxy from the proxy pool.
        :return: proxy string, or False on connection error
        """
        try:
            response = requests.get(self.proxypool_url, timeout=5)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        """
        When retry_times > 10, use a random proxy from the pool for this request.
        :param request:
        :param spider:
        :return:
        """
        if request.meta.get('retry_times') and request.meta.get('retry_times') > 10:
            proxy = self.get_random_proxy()
            self.logger.debug('Get proxy %s', proxy)
            if proxy:
                uri = 'http://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy %s', proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxypool_url=settings.get('PROXYPOOL_URL')
        )
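
Together the two middlewares stagger the fallback: retries 1 through 10 go through the fixed proxy tunnel, and anything beyond 10 retries pulls a fresh proxy from the pool. Neither is active unless registered in the Scrapy settings, which are not shown in this excerpt; a minimal sketch, where the priorities and the RETRY_TIMES value are assumptions and only the class paths and setting names come from this commit:

# Hypothetical excerpt from the Scrapy settings module.
import os

RETRY_TIMES = 20  # must be allowed to exceed 10, otherwise ProxypoolMiddleware never triggers

DOWNLOADER_MIDDLEWARES = {
    'spider.book.middlewares.ProxytunnelMiddleware': 543,
    'spider.book.middlewares.ProxypoolMiddleware': 544,
}

PROXYTUNNEL_URL = os.environ.get('PROXYTUNNEL_URL')
PROXYPOOL_URL = os.environ.get('PROXYPOOL_URL')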

spider/book/pipelines.py

+42 lines
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from django.db import IntegrityError
import logging
from merry import Merry
from spider.book.items import BookItem, CommentItem
from app.models import Book, Comment

merry = Merry()
logger = logging.getLogger(__name__)

class PgSQLPipeline():
    """
    Save scraped items to PostgreSQL.
    """

    @merry._try
    def process_item(self, item, spider):
        merry.g.item = item
        logger.debug('Process item type %s', type(item))
        if isinstance(item, BookItem):
            item.instance.save()
            logger.info('Saved book %s', item.instance)

        if isinstance(item, CommentItem):
            book_id = item.get('book_id')
            book, created = Book.objects.get_or_create(id=book_id)
            logger.info('Created book %s' if created else 'Book %s exists', book)
            item.instance.book = book
            item.instance.save()
            logger.info('Saved comment %s', item.instance)

        return item

    @merry._except(IntegrityError)
    def process_integrity_error(self, e):
        item = merry.g.item
        logger.info('《%s》 of %s already exists', item.get('title'), item.get('website'))
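
merry routes any IntegrityError raised inside process_item (for example a duplicate key on save()) to process_integrity_error instead of letting it fail the crawl. As the header comment notes, the pipeline also has to be registered in ITEM_PIPELINES; that settings excerpt is not shown here, so the following is a sketch with an assumed priority value:

# Hypothetical excerpt from the Scrapy settings module.
ITEM_PIPELINES = {
    'spider.book.pipelines.PgSQLPipeline': 300,
}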
