Commit 23e16bb

implemented new awesome collector, fixed some bugs

1 parent a364585, commit 23e16bb
File tree

9 files changed: +107 -12 lines


check_from_stdin.py (+6, -2)

@@ -31,13 +31,17 @@ async def check_task(ip, port):
             break
     # if check_result:
     #     print('proxy {} works'.format(proxy_url))
-    print('+' if check_result else '-', end='')
+    print('+' if check_result else '-', end='', file=sys.stderr)
+    sys.stderr.flush()
 
 
 async def main():
     for line in sys.stdin:
         line = line.strip()
-        groups = re.search(proxy_find_regex, line).groups()
+        try:
+            groups = re.search(proxy_find_regex, line).groups()
+        except:
+            continue
         ip = '.'.join(groups[:4])
         port = groups[4]
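The bare except above silently skips any input line that does not match proxy_find_regex. For reference, an equivalent guard (a sketch, not part of this commit, assuming re, sys and proxy_find_regex are already defined in the module as in the original file) tests the match object for None instead of catching the AttributeError raised by .groups():

async def main():
    for line in sys.stdin:
        line = line.strip()
        # sketch: skip non-matching lines explicitly instead of using try/except
        match = re.search(proxy_find_regex, line)
        if match is None:
            continue
        groups = match.groups()
        ip = '.'.join(groups[:4])
        port = groups[4]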

checkers/base_checker.py (+1)

@@ -94,6 +94,7 @@ async def check(self, proxy_address: str, timeout: int = None) -> tuple:
             raise OSError("Too many open files")
 
         if settings.DEBUG:
+            # TODO: move to logs!
             print(
                 f"proxy {proxy_address} doesn't work because of exception {type(ex)}, message is {message}"
             )

collectors/abstract_collector.py (+8, -1)

@@ -31,7 +31,14 @@ async def collect(self):
 
         ip can be both ipv4 and ipv6
 
-        will support yield in the future, now just return list
+        return either list or async generator:
+
+        ::
+
+        >> async def collect(self):
+        >>     for proxy in something:
+        >>         yield proxy
+
         """
 
         return []
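With this change collect() may be either a coroutine returning a list or an async generator. A minimal sketch (not part of the commit) of how a caller could consume both shapes uniformly, using inspect.isasyncgen to tell them apart; the helper name iterate_proxies is hypothetical:

import inspect

async def iterate_proxies(collector):
    # collect() either returns an async generator (new style)
    # or a coroutine that resolves to a list (old style)
    result = collector.collect()
    if inspect.isasyncgen(result):
        async for proxy in result:
            yield proxy
    else:
        for proxy in await result:
            yield proxy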
(new file, +68)

@@ -0,0 +1,68 @@
+import asyncio
+import re
+
+from collectors import AbstractCollector
+from bs4 import BeautifulSoup
+
+import http_client
+
+SLEEP_BETWEEN_PAGES_SECONDS = 1
+
+
+class Collector(AbstractCollector):
+    __collector__ = True
+
+    def __init__(self):
+        super(Collector, self).__init__()
+        # it provides really a lot of proxies so we'll check it rarely
+        # 24 hours
+        self.processing_period = 24 * 3600
+        self.url = "http://freeproxylists.com"
+
+    async def collect(self):
+        html = await http_client.get_text(self.url)
+        soup = BeautifulSoup(html, features="lxml")
+
+        for link in soup.select("a"):
+            link = link["href"].strip()
+
+            if re.match(r"^/[a-zA-Z0-9_-]+\.html$", link):
+                async for proxy in self.collect_from_page(link):
+                    yield proxy
+
+    async def collect_from_page(self, page_link):
+        html = await http_client.get_text(self.url + page_link)
+
+        soup = BeautifulSoup(html, features="lxml")
+
+        for link in soup.select("a"):
+            link = link["href"].strip()
+
+            regex = r"^([a-zA-Z0-9_-]+)/([0-9]+)\.html$"
+            match = re.match(regex, link)
+
+            if match:
+                type_of_proxies, proxies_id = match.groups()
+                url = f"{self.url}/load_{type_of_proxies}_{proxies_id}.html"
+
+                async for proxy in self.collect_from_table(url):
+                    yield proxy
+
+    async def collect_from_table(self, table_url):
+        html = await http_client.get_text(table_url)
+
+        soup = BeautifulSoup(html, features="lxml")
+
+        table_text = soup.find("quote").contents[0]
+        soup = BeautifulSoup(table_text, features="lxml")
+
+        for tr in soup.find_all("tr"):
+            children = tr.find_all("td")
+            if len(children) != 2:
+                continue
+
+            ip, port = [child.contents[0] for child in children]
+            proxy = f"{ip}:{port}"
+            yield proxy
+
+        await asyncio.sleep(SLEEP_BETWEEN_PAGES_SECONDS)
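A hypothetical standalone driver (not in the commit) that previews the first few proxies yielded by the new collector; it assumes the repository's collectors package and http_client module are importable, and the names preview/limit are illustrative:

import asyncio

async def preview(limit=5):
    collector = Collector()
    count = 0
    async for proxy in collector.collect():
        print(proxy)
        count += 1
        if count >= limit:
            break

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(preview(limit=10))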

docs/source/guides/how_to_create_collector.rst (+1, -1)

@@ -99,7 +99,7 @@ you can run proxy_py with `--test-collector` option:
 
 .. code-block:: bash
 
-    python3 main.py --test-collector collectors/web/cn/89ip/collector.py:Collector
+    python3 main.py core --test-collector collectors/web/cn/89ip/collector.py:Collector
 
 which means to take class Collector from the file `collectors/web/cn/89ip/collector.py`

http_client.py (+1, -2)

@@ -1,10 +1,9 @@
-import json
-
 from proxy_py import settings
 from fake_useragent import UserAgent
 from aiosocks.connector import ProxyConnector, ProxyClientRequest
 
 import aiohttp
+import json
 
 
 class HttpClientResult:

proxy_py/_settings.py (+2, -2)

@@ -63,12 +63,12 @@
 DEAD_PROXY_CHECKING_PERIOD = 1 * 24 * 60 * 60
 DO_NOT_CHECK_ON_N_BAD_CHECKS = DEAD_PROXY_THRESHOLD + 14
 # how many seconds to wait for response from proxy
-PROXY_CHECKING_TIMEOUT = 10
+PROXY_CHECKING_TIMEOUT = 30
 # do not check proxy from collector if it has been checked recently
 PROXY_NOT_CHECKING_PERIOD = 15 * 60
 # limiter for maximum number of proxies gotten from collector
 # to fix potential issue with collectors' spamming
-COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST = 16384
+COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST = 2 * 65536
 SLEEP_AFTER_ERROR_PERIOD = 10
 # how many collectors to process concurrently
 NUMBER_OF_CONCURRENT_COLLECTORS = 1

requirements.txt (+1)

@@ -15,3 +15,4 @@ pytest-asyncio
 termcolor
 uvloop
 geoip2
+bs4

tools/test_collector.py (+19, -4)

@@ -19,6 +19,9 @@ def eprint(*args, **kwargs):
     return print(*args, file=sys.stderr, **kwargs)
 
 
+PROXIES_PER_TIME = 8192
+
+
 async def run(path: str):
     path, class_name = path.split(':', maxsplit=2)
     path = re.sub(r"\.py$", "", path).replace('/', '.')
@@ -27,16 +30,28 @@ async def run(path: str):
     try:
         collector = collectors[path]
     except KeyError:
-        eprint("Collector doesn't exist")
+        eprint("Collector doesn't exist(maybe you forgot to set __collector__ to True)")
         return 1
 
-    result = list(await collector.collect())
+    total = 0
+    result = []
+
+    async for proxy in collector.collect():
+        total += 1
+        result.append(proxy)
 
-    print("Total number of proxies: {}".format(len(result)))
-    await asyncio.gather(*[process_proxy(proxy) for proxy in result])
+        if len(result) >= PROXIES_PER_TIME:
+            print(f"got more than {PROXIES_PER_TIME} proxies, checking this part")
+            # await asyncio.gather(*[process_proxy(proxy) for proxy in result])
+            result = []
+
+    # await asyncio.gather(*[process_proxy(proxy) for proxy in result])
+    print("Total number of proxies: {}".format(total))
 
 
 proxies_semaphore = asyncio.BoundedSemaphore(settings.NUMBER_OF_CONCURRENT_TASKS)
+
+
 async def process_proxy(proxy_url: str):
     async with proxies_semaphore:
         try:
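Note that both asyncio.gather calls are commented out in this commit, so the tool only counts proxies per batch. A sketch (not the committed code) of the same batching loop with checking re-enabled, reusing the existing process_proxy and PROXIES_PER_TIME; the function name run_with_checking is hypothetical:

async def run_with_checking(collector):
    # collect proxies from the async generator in batches and check each full batch
    batch = []
    async for proxy in collector.collect():
        batch.append(proxy)
        if len(batch) >= PROXIES_PER_TIME:
            await asyncio.gather(*[process_proxy(p) for p in batch])
            batch = []
    if batch:
        # check the remaining partial batch
        await asyncio.gather(*[process_proxy(p) for p in batch])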
