Implemented a new async-generator-based collector for freeproxylists.com and fixed some bugs

DevAlone · DevAlone · commit 23e16bb2213c · 2020-04-18T19:46:25.000Z
diff --git a/check_from_stdin.py b/check_from_stdin.py
@@ -31,13 +31,17 @@ async def check_task(ip, port):
                 break
         # if check_result:
         #     print('proxy {} works'.format(proxy_url))
-        print('+' if check_result else '-', end='')
+        print('+' if check_result else '-', end='', file=sys.stderr)
+        sys.stderr.flush()
 
 
 async def main():
     for line in sys.stdin:
         line = line.strip()
-        groups = re.search(proxy_find_regex, line).groups()
+        try:
+            groups = re.search(proxy_find_regex, line).groups()
+        except:
+            continue
         ip = '.'.join(groups[:4])
         port = groups[4]
 
diff --git a/checkers/base_checker.py b/checkers/base_checker.py
@@ -94,6 +94,7 @@ async def check(self, proxy_address: str, timeout: int = None) -> tuple:
                 raise OSError("Too many open files")
 
             if settings.DEBUG:
+                # TODO: move to logs!
                 print(
                     f"proxy {proxy_address} doesn't work because of exception {type(ex)}, message is {message}"
                 )
diff --git a/collectors/abstract_collector.py b/collectors/abstract_collector.py
@@ -31,7 +31,14 @@ async def collect(self):
 
         ip can be both ipv4 and ipv6
 
-        will support yield in the future, now just return list
+        return either list or async generator:
+
+        ::
+
+            >> async def collect(self):
+            >>     for proxy in something:
+            >>         yield proxy
+
         """
 
         return []
diff --git a/collectors/web/com/freeproxylists/collector.py b/collectors/web/com/freeproxylists/collector.py
@@ -0,0 +1,68 @@
+import asyncio
+import re
+
+from collectors import AbstractCollector
+from bs4 import BeautifulSoup
+
+import http_client
+
+SLEEP_BETWEEN_PAGES_SECONDS = 1
+
+
+class Collector(AbstractCollector):
+    __collector__ = True
+
+    def __init__(self):
+        super(Collector, self).__init__()
+        # it provides really a lot of proxies so we'll check it rarely
+        # 24 hours
+        self.processing_period = 24 * 3600
+        self.url = "http://freeproxylists.com"
+
+    async def collect(self):
+        html = await http_client.get_text(self.url)
+        soup = BeautifulSoup(html, features="lxml")
+
+        for link in soup.select("a"):
+            link = link["href"].strip()
+
+            if re.match(r"^/[a-zA-Z0-9_-]+\.html$", link):
+                async for proxy in self.collect_from_page(link):
+                    yield proxy
+
+    async def collect_from_page(self, page_link):
+        html = await http_client.get_text(self.url + page_link)
+
+        soup = BeautifulSoup(html, features="lxml")
+
+        for link in soup.select("a"):
+            link = link["href"].strip()
+
+            regex = r"^([a-zA-Z0-9_-]+)/([0-9]+)\.html$"
+            match = re.match(regex, link)
+
+            if match:
+                type_of_proxies, proxies_id = match.groups()
+                url = f"{self.url}/load_{type_of_proxies}_{proxies_id}.html"
+
+                async for proxy in self.collect_from_table(url):
+                    yield proxy
+
+    async def collect_from_table(self, table_url):
+        html = await http_client.get_text(table_url)
+
+        soup = BeautifulSoup(html, features="lxml")
+
+        table_text = soup.find("quote").contents[0]
+        soup = BeautifulSoup(table_text, features="lxml")
+
+        for tr in soup.find_all("tr"):
+            children = tr.find_all("td")
+            if len(children) != 2:
+                continue
+
+            ip, port = [child.contents[0] for child in children]
+            proxy = f"{ip}:{port}"
+            yield proxy
+
+        await asyncio.sleep(SLEEP_BETWEEN_PAGES_SECONDS)
diff --git a/docs/source/guides/how_to_create_collector.rst b/docs/source/guides/how_to_create_collector.rst
@@ -99,7 +99,7 @@ you can run proxy_py with `--test-collector` option:
 
 .. code-block:: bash
 
-    python3 main.py --test-collector collectors/web/cn/89ip/collector.py:Collector
+    python3 main.py core --test-collector collectors/web/cn/89ip/collector.py:Collector
 
 which means to take class Collector from the file `collectors/web/cn/89ip/collector.py`
 
diff --git a/http_client.py b/http_client.py
@@ -1,10 +1,9 @@
-import json
-
 from proxy_py import settings
 from fake_useragent import UserAgent
 from aiosocks.connector import ProxyConnector, ProxyClientRequest
 
 import aiohttp
+import json
 
 
 class HttpClientResult:
diff --git a/proxy_py/_settings.py b/proxy_py/_settings.py
@@ -63,12 +63,12 @@
 DEAD_PROXY_CHECKING_PERIOD = 1 * 24 * 60 * 60
 DO_NOT_CHECK_ON_N_BAD_CHECKS = DEAD_PROXY_THRESHOLD + 14
 # how many seconds to wait for response from proxy
-PROXY_CHECKING_TIMEOUT = 10
+PROXY_CHECKING_TIMEOUT = 30
 # do not check proxy from collector if it has been checked recently
 PROXY_NOT_CHECKING_PERIOD = 15 * 60
 # limiter for maximum number of proxies gotten from collector
 # to fix potential issue with collectors' spamming
-COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST = 16384
+COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST = 2 * 65536
 SLEEP_AFTER_ERROR_PERIOD = 10
 # how many collectors to process concurrently
 NUMBER_OF_CONCURRENT_COLLECTORS = 1
diff --git a/requirements.txt b/requirements.txt
@@ -15,3 +15,4 @@ pytest-asyncio
 termcolor
 uvloop
 geoip2
+bs4
diff --git a/tools/test_collector.py b/tools/test_collector.py
@@ -19,6 +19,9 @@ def eprint(*args, **kwargs):
     return print(*args, file=sys.stderr, **kwargs)
 
 
+PROXIES_PER_TIME = 8192
+
+
 async def run(path: str):
     path, class_name = path.split(':', maxsplit=2)
     path = re.sub(r"\.py$", "", path).replace('/', '.')
@@ -27,16 +30,28 @@ async def run(path: str):
     try:
         collector = collectors[path]
     except KeyError:
-        eprint("Collector doesn't exist")
+        eprint("Collector doesn't exist(maybe you forgot to set __collector__ to True)")
         return 1
 
-    result = list(await collector.collect())
+    total = 0
+    result = []
+
+    async for proxy in collector.collect():
+        total += 1
+        result.append(proxy)
 
-    print("Total number of proxies: {}".format(len(result)))
-    await asyncio.gather(*[process_proxy(proxy) for proxy in result])
+        if len(result) >= PROXIES_PER_TIME:
+            print(f"got more than {PROXIES_PER_TIME} proxies, checking this part")
+            # await asyncio.gather(*[process_proxy(proxy) for proxy in result])
+            result = []
+
+    # await asyncio.gather(*[process_proxy(proxy) for proxy in result])
+    print("Total number of proxies: {}".format(total))
 
 
 proxies_semaphore = asyncio.BoundedSemaphore(settings.NUMBER_OF_CONCURRENT_TASKS)
+
+
 async def process_proxy(proxy_url: str):
     async with proxies_semaphore:
         try:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -94,6 +94,7 @@ async def check(self, proxy_address: str, timeout: int = None) -> tuple:` |
| `94` | `94` | `raise OSError("Too many open files")` |
| `95` | `95` | |
| `96` | `96` | `if settings.DEBUG:` |
| | `97` | `+ # TODO: move to logs!` |
| `97` | `98` | `print(` |
| `98` | `99` | `f"proxy {proxy_address} doesn't work because of exception {type(ex)}, message is {message}"` |
| `99` | `100` | `)` |
-Original file line number
+Diff line change
 termcolor
 uvloop
 geoip2
+bs4