Skip to content

Commit 96628a3

Browse files
committed
feat: add tool to detect down websites
1 parent 34ac478 commit 96628a3

File tree

4 files changed

+286
-4
lines changed

4 files changed

+286
-4
lines changed

.github/workflows/python-app.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ on:
55
branches: [ main ]
66
paths:
77
- 'data/**'
8-
- 'src/**'
8+
- 'src/generate.py'
99
- '.github/workflows/**'
10-
workflow_dispatch:
10+
workflow_dispatch:
1111

1212
jobs:
1313
build:
@@ -20,12 +20,12 @@ jobs:
2020
uses: actions/setup-python@v2
2121
with:
2222
python-version: "3.10"
23-
- name: Clear dist
23+
- name: Clear dist
2424
# in case a malicious commit/pull request manually modifies ./dist/
2525
run: rm -rf ./dist
2626
- name: Generate dist
2727
run: python ./src/generate.py
28-
28+
2929
- name: commit files
3030
run: |
3131
git config --local user.email "[email protected]"

src/clean_data/.gitignore

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
*.pkl
2+
clean-report.md
3+
4+
# Byte-compiled / optimized / DLL files
5+
__pycache__/
6+
*.py[cod]
7+
*$py.class
8+
9+
# C extensions
10+
*.so
11+
12+
# Distribution / packaging
13+
.Python
14+
build/
15+
develop-eggs/
16+
dist/
17+
downloads/
18+
eggs/
19+
.eggs/
20+
lib/
21+
lib64/
22+
parts/
23+
sdist/
24+
var/
25+
wheels/
26+
share/python-wheels/
27+
*.egg-info/
28+
.installed.cfg
29+
*.egg
30+
MANIFEST
31+
32+
# PyInstaller
33+
# Usually these files are written by a python script from a template
34+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
35+
*.manifest
36+
*.spec
37+
38+
# Installer logs
39+
pip-log.txt
40+
pip-delete-this-directory.txt
41+
42+
# Unit test / coverage reports
43+
htmlcov/
44+
.tox/
45+
.nox/
46+
.coverage
47+
.coverage.*
48+
.cache
49+
nosetests.xml
50+
coverage.xml
51+
*.cover
52+
*.py,cover
53+
.hypothesis/
54+
.pytest_cache/
55+
cover/
56+
57+
# Translations
58+
*.mo
59+
*.pot
60+
61+
# Django stuff:
62+
*.log
63+
local_settings.py
64+
db.sqlite3
65+
db.sqlite3-journal
66+
67+
# Flask stuff:
68+
instance/
69+
.webassets-cache
70+
71+
# Scrapy stuff:
72+
.scrapy
73+
74+
# Sphinx documentation
75+
docs/_build/
76+
77+
# PyBuilder
78+
.pybuilder/
79+
target/
80+
81+
# Jupyter Notebook
82+
.ipynb_checkpoints
83+
84+
# IPython
85+
profile_default/
86+
ipython_config.py
87+
88+
# pyenv
89+
# For a library or package, you might want to ignore these files since the code is
90+
# intended to run in multiple environments; otherwise, check them in:
91+
# .python-version
92+
93+
# pipenv
94+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
96+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
97+
# install all needed dependencies.
98+
#Pipfile.lock
99+
100+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
101+
__pypackages__/
102+
103+
# Celery stuff
104+
celerybeat-schedule
105+
celerybeat.pid
106+
107+
# SageMath parsed files
108+
*.sage.py
109+
110+
# Environments
111+
.env
112+
.venv
113+
env/
114+
venv/
115+
ENV/
116+
env.bak/
117+
venv.bak/
118+
119+
# Spyder project settings
120+
.spyderproject
121+
.spyproject
122+
123+
# Rope project settings
124+
.ropeproject
125+
126+
# mkdocs documentation
127+
/site
128+
129+
# mypy
130+
.mypy_cache/
131+
.dmypy.json
132+
dmypy.json
133+
134+
# Pyre type checker
135+
.pyre/
136+
137+
# pytype static type analyzer
138+
.pytype/
139+
140+
# Cython debug symbols
141+
cython_debug/

src/clean_data/main.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
from collections import defaultdict
2+
from pathlib import Path
3+
from typing import Any, NamedTuple
4+
from dns.resolver import Resolver, NoAnswer, NXDOMAIN, LifetimeTimeout, NoNameservers
5+
from dns.rdatatype import RdataType
6+
from time import sleep
7+
8+
import logging
9+
10+
# Line terminator appended to every piece of Markdown written to the report.
NEW_LINE = "\n"
# Nameserver addresses assigned to the resolver's ``nameservers`` in main().
DNS_SERVER = ["1.1.1.1"]
# Pause, in seconds, passed to sleep() after each filter line is checked.
DNS_SLEEP = 0.1
13+
14+
################################################
15+
#
16+
# DNS helpers
17+
#
18+
################################################
19+
20+
def domain_has_ip(resolver, domain):
    """Return True if the domain has at least one IP (IPv4 or IPv6).

    Args:
        resolver: Object exposing ``resolve(name, rdtype)``; main() passes a
            ``dns.resolver.Resolver``.
        domain: Host name to look up.

    Returns:
        True as soon as an A or AAAA lookup yields a non-empty record set,
        False when neither lookup does.
    """
    # Query A first, then AAAA, and stop at the first hit: a domain that
    # already resolved does not need a second network round trip.
    for record_type in (RdataType.A, RdataType.AAAA):
        try:
            answer = resolver.resolve(domain, record_type)
        except (NoAnswer, NXDOMAIN, LifetimeTimeout, NoNameservers):
            # No usable response for this record type: counts as "no IP".
            continue
        if answer.rrset is not None and len(answer.rrset) > 0:
            return True
    return False
39+
40+
41+
################################################
42+
#
43+
# DEBUG
44+
#
45+
################################################
46+
47+
# For offline debugging
48+
STUB_DNS = False
49+
50+
if STUB_DNS:
51+
def domain_has_ip(*args, **kwargs):
52+
from random import random
53+
return random() < 0.9
54+
55+
################################################
56+
#
57+
# Markdown helpers
58+
#
59+
################################################
60+
61+
def md_link(content: str, href: str):
    """Build an inline Markdown link ``[content](href)``.

    Markdown-significant characters in ``content`` are backslash-escaped so
    that text such as ``*://*.example.com/*`` is shown literally instead of
    being parsed as emphasis, and so that a ``|`` cannot end a table cell.

    Args:
        content: Visible link text.
        href: Link target, inserted unchanged.

    Returns:
        The Markdown link as a string.
    """
    special = "\\`*_[]|"
    label = "".join("\\" + ch if ch in special else ch for ch in content)
    return f"[{label}]({href})"
63+
64+
def md_tr(*td: str):
65+
return "|".join(("", *td, "")) + NEW_LINE
66+
67+
68+
################################################
69+
#
70+
# Main
71+
#
72+
################################################
73+
74+
class CleanResult(NamedTuple):
    """Outcome of checking a single filter line taken from a data file."""

    # Filter line as read from the data file, surrounding whitespace stripped.
    url_filter: str
    # Host part of the filter, without the scheme, path or "*." wildcard.
    domain: str
    # True when at least one candidate host name resolved to an IP address.
    has_ip: bool
78+
79+
80+
def _hosts_to_check(url_filter):
    """Return ``(domain, hosts)`` for one stripped filter line.

    ``domain`` is the filter without its ``*://`` scheme, its path and a
    leading ``*.`` wildcard. ``hosts`` lists the names to resolve, in order:
    for a wildcard filter the ``www.`` variant first, then the bare domain.
    """
    domain = url_filter.replace("*://", "").split("/", 1)[0]
    if not domain.startswith("*."):
        return domain, [domain]
    # Drop only the leading wildcard label; a later "*." is left untouched.
    domain = domain[2:]
    return domain, ["www." + domain, domain]


def main():
    """Check every filter in ``data/np*.txt`` and write a Markdown report.

    Each non-empty line that is not a ``!`` comment is reduced to a host
    name and resolved through the nameservers in ``DNS_SERVER``. The results
    are grouped per data file and written to
    ``src/clean_data/clean-report.md`` as one table per file, with a cross
    marking the entries that did not resolve.
    """
    root_path = Path(__file__).parent.joinpath("../../").resolve()
    report_path = root_path.joinpath("src", "clean_data", "clean-report.md")

    # configure=False: ignore the system resolver settings and use only the
    # nameservers listed in DNS_SERVER.
    resolver = Resolver(configure=False)
    resolver.nameservers = DNS_SERVER

    clean_result_per_file = defaultdict(list)

    for source_f in sorted(root_path.joinpath("data").glob("np*.txt")):
        # Explicit encoding so parsing does not depend on the platform
        # default; the report below is written as utf8 too.
        with source_f.open("r", encoding="utf8") as source_fd:
            for line in source_fd:
                url_filter = line.strip()
                # Strip before testing so an indented "!" comment is skipped
                # instead of being resolved as a domain.
                if not url_filter or url_filter.startswith("!"):
                    continue

                domain, hosts = _hosts_to_check(url_filter)

                logging.info("Try resolve %s", domain)

                this_domain_has_ip = any(
                    domain_has_ip(resolver, host) for host in hosts
                )
                clean_result_per_file[source_f.name].append(
                    CleanResult(url_filter, domain, this_domain_has_ip)
                )

                # Throttle the queries sent to the nameserver.
                sleep(DNS_SLEEP)

    # Mode "w" truncates a previous report, so no explicit delete is needed.
    with report_path.open("w", encoding="utf8") as report_fd:
        report_fd.write("# Data cleaning report" + NEW_LINE + NEW_LINE)

        for file, clean_result_list in clean_result_per_file.items():
            report_fd.write(f"## Domains in `{file}`" + NEW_LINE * 2)

            report_fd.write(md_tr("domain", "has_ip", "Google site:", "DDG site:"))
            report_fd.write(md_tr("---", ":---:", "---", "---"))

            clean_result: CleanResult
            for clean_result in clean_result_list:
                report_fd.write(md_tr(
                    md_link(clean_result.url_filter, f"//{clean_result.domain}"),
                    # Only failures are flagged; resolvable rows stay blank.
                    "" if clean_result.has_ip else "❌",
                    md_link("Search 🔎", f"https://www.google.com/search?q=site%3A{clean_result.domain}"),
                    md_link("Search 🔎", f"https://duckduckgo.com/?q=site%3A{clean_result.domain}"),
                ))
134+
135+
136+
if __name__ == "__main__":
    # Set up default logging, then lower the root logger's threshold to INFO
    # so the per-domain "Try resolve" messages are emitted.
    logging.basicConfig()
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    main()

src/clean_data/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
dnspython>=2.2

0 commit comments

Comments
 (0)