Skip to content

Commit 3702ae5

Browse files
committed
Refactor repository.
1 parent f310f2a commit 3702ae5

18 files changed

+832
-344
lines changed

.env.example

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
TARGET_URL=https://hackersandslackers.com

.flake8

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[flake8]
2+
select = E9,F63,F7,F82
3+
exclude = .git,.github,__pycache__,.pytest_cache,.venv,logs,creds,.reports
4+
max-line-length = 120

.gitignore

+5-3
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,10 @@ venv.bak/
106106
credentials.json
107107
gcloud.json
108108

109-
# Etc.
110-
.idea
111-
.pytest_cache
109+
# OS
112110
.DS_Store
113111

112+
# IDEs
113+
.idea
114+
.vscode
115+

LICENSE

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2020 Hackers and Slackers
3+
Copyright (c) 2024 Hackers and Slackers
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

Makefile

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
# Manage the local Poetry + virtualenv workflow for this project.
# Fixed: $CURDIR -> $(CURDIR) (bare $C expansion bug), removed no-op
# parse-time `activate`, $(MAKE) for recursion, corrected `find` predicates.
PROJECT_NAME := $(notdir $(CURDIR))
VIRTUAL_ENV := $(CURDIR)/.venv
LOCAL_PYTHON := $(VIRTUAL_ENV)/bin/python3

define HELP
Manage $(PROJECT_NAME). Usage:

make run        - Run $(PROJECT_NAME) locally.
make install    - Create local virtualenv & install dependencies.
make deploy     - Set up project & run locally.
make update     - Update dependencies via Poetry and output resulting `requirements.txt`.
make format     - Run Python code formatter & sort dependencies.
make lint       - Check code formatting with flake8.
make clean      - Remove extraneous compiled files, caches, logs, etc.

endef
export HELP

.PHONY: all help env run install deploy update format lint clean

# Default goal: print usage.
all help:
	@echo "$$HELP"

# Alias target; satisfied once the virtualenv directory exists.
env: $(VIRTUAL_ENV)

# Create the virtualenv and point Poetry at it.
$(VIRTUAL_ENV):
	if [ ! -d $(VIRTUAL_ENV) ]; then \
		echo "Creating Python virtual env in \`$(VIRTUAL_ENV)\`"; \
		python3 -m venv $(VIRTUAL_ENV); \
	fi
	poetry config virtualenvs.path $(VIRTUAL_ENV)

run: env
	$(LOCAL_PYTHON) -m main

install: env
	$(LOCAL_PYTHON) -m pip install --upgrade pip setuptools wheel && \
	poetry install --with dev --sync
	@echo "Installed dependencies in \`$(VIRTUAL_ENV)\`"

# Use $(MAKE), not `make`, so -j/-n flags propagate to sub-invocations.
deploy:
	$(MAKE) install && \
	$(MAKE) run

update: env
	$(LOCAL_PYTHON) -m pip install --upgrade pip setuptools wheel && \
	poetry update --with dev && \
	poetry export -f requirements.txt --output requirements.txt --without-hashes && \
	echo "Updated dependencies in \`$(VIRTUAL_ENV)\`"

format: env
	$(LOCAL_PYTHON) -m isort --multi-line=3 . && \
	$(LOCAL_PYTHON) -m black .

lint: env
	$(LOCAL_PYTHON) -m flake8 . --count \
		--select=E9,F63,F7,F82 \
		--exclude .git,.github,__pycache__,.pytest_cache,.venv,logs,creds,docs,.reports \
		--show-source \
		--statistics

# Remove caches, compiled files, lockfiles, logs, and reports.
# Directory matches need -name (the old -wholename '__pycache__' never matched).
clean:
	$(RM) poetry.lock .coverage Pipfile.lock
	find . -name '*.pyc' -delete
	find . -type d -name '__pycache__' -exec rm -rf {} +
	find . -type d -name '.pytest_cache' -exec rm -rf {} +
	rm -rf $(VIRTUAL_ENV)
	rm -f ./logs/*.log
	rm -rf ./.reports/*

Pipfile

-14
This file was deleted.

Pipfile.lock

-135
This file was deleted.

README.md

+17-33
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,38 @@
11
# BeautifulSoup Web Scraping Tutorial
22

3-
![Python](https://img.shields.io/badge/Python-v^3.8-blue.svg?logo=python&longCache=true&logoColor=white&colorB=5e81ac&style=flat-square&colorA=4c566a)
4-
![BeautifulSoup](https://img.shields.io/badge/BeautifulSoup4-v4.9.1-blue.svg?longCache=true&logo=python&longCache=true&style=flat-square&logoColor=white&colorB=5e81ac&colorA=4c566a)
5-
![Requests](https://img.shields.io/badge/Requests-v2.23.0-blue.svg?longCache=true&logo=python&longCache=true&style=flat-square&logoColor=white&colorB=5e81ac&colorA=4c566a)
3+
![Python](https://img.shields.io/badge/Python-v^3.10-blue.svg?logo=python&longCache=true&logoColor=white&colorB=5e81ac&style=flat-square&colorA=4c566a)
4+
![BeautifulSoup](https://img.shields.io/badge/BeautifulSoup4-v4.12.2-blue.svg?longCache=true&logo=python&longCache=true&style=flat-square&logoColor=white&colorB=5e81ac&colorA=4c566a)
5+
![Requests](https://img.shields.io/badge/Requests-v2.31.0-blue.svg?longCache=true&logo=python&longCache=true&style=flat-square&logoColor=white&colorB=5e81ac&colorA=4c566a)
66
![GitHub Last Commit](https://img.shields.io/github/last-commit/hackersandslackers/beautifulsoup-tutorial.svg?style=flat-square&colorA=4c566a&colorB=a3be8c)
77
[![GitHub Issues](https://img.shields.io/github/issues/hackersandslackers/beautifulsoup-tutorial.svg?style=flat-square&colorA=4c566a&colorB=ebcb8b&logo=Github)](https://github.com/hackersandslackers/beautifulsoup-tutorial/issues)
88
[![GitHub Stars](https://img.shields.io/github/stars/hackersandslackers/beautifulsoup-tutorial.svg?style=flat-square&colorB=ebcb8b&colorA=4c566a&logo=Github)](https://github.com/hackersandslackers/beautifulsoup-tutorial/stargazers)
99
[![GitHub Forks](https://img.shields.io/github/forks/hackersandslackers/beautifulsoup-tutorial.svg?style=flat-square&colorA=4c566a&colorB=ebcb8b&logo=Github)](https://github.com/hackersandslackers/beautifulsoup-tutorial/network)
1010

1111
![Beautifulsoup Tutorial](https://github.com/hackersandslackers/beautifulsoup-tutorial/blob/master/.github/[email protected]?raw=true)
1212

13-
A beginner's tutorial to scraping websites using Python's [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) library. This repository is the source code for the tutorial found here:
14-
https://hackersandslackers.com/scraping-urls-with-beautifulsoup/
13+
A beginner's tutorial to scraping websites using Python's [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) library.
1514

16-
## Installation
15+
This repository is the source code for the tutorial found [here](https://hackersandslackers.com/beautifulsoup-tutorial/).
1716

18-
**Installation via `requirements.txt`**:
17+
## Getting Started
1918

20-
```shell
21-
$ git clone https://github.com/hackersandslackers/beautifulsoup-tutorial.git
22-
$ cd beautifulsoup-tutorial
23-
$ python3 -m venv myenv
24-
$ source myenv/bin/activate
25-
$ pip3 install -r requirements.txt
26-
$ python3 main.py
27-
```
19+
Get set up locally in two steps:
2820

29-
**Installation via [Pipenv](https://pipenv-fork.readthedocs.io/en/latest/)**:
21+
### Environment Variables
3022

31-
```shell
32-
$ git clone https://github.com/hackersandslackers/beautifulsoup-tutorial.git
33-
$ cd beautifulsoup-tutorial
34-
$ pipenv shell
35-
$ pipenv update
36-
$ python3 main.py
37-
```
23+
Replace the value in **.env.example** with your value, and rename this file to **.env**:
3824

39-
**Installation via [Poetry](https://python-poetry.org/)**:
25+
* `TARGET_URL`: An HTTP URL to scrape and display metadata from.
4026

41-
```shell
42-
$ git clone https://github.com/hackersandslackers/beautifulsoup-tutorial.git
43-
$ cd beautifulsoup-tutorial
44-
$ poetry shell
45-
$ poetry update
46-
$ poetry run
47-
```
27+
### Installation
4828

49-
## How to Use
29+
Get up and running with `make deploy`:
5030

51-
This script will output metadata scraped from whichever URL is specified in **config.py**. Simply change the value of this variable to test the script against any URL of your choice.
31+
```shell
32+
git clone https://github.com/hackersandslackers/beautifulsoup-tutorial.git
33+
cd beautifulsoup-tutorial
34+
make deploy
35+
```
5236

5337
------------------
5438

beautifulsoup_tutorial/__init__.py

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
"""Scrape metadata from target URL."""
import pprint

from beautifulsoup_tutorial.fetch import fetch_html_from_url
from beautifulsoup_tutorial.scrape import scrape_page_metadata

from config import TARGET_URL


def init_script() -> dict:
    """
    Fetch a given HTML page to extract & display metadata for.

    returns: dict
    """
    # Fetch the configured page, scrape its metadata, and pretty-print it.
    response = fetch_html_from_url(TARGET_URL)
    page_metadata = scrape_page_metadata(response, TARGET_URL)
    printer = pprint.PrettyPrinter(indent=4, width=120, sort_dicts=False)
    printer.pprint(page_metadata)
    return page_metadata

beautifulsoup_tutorial/fetch.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
"""Fetch raw HTML from a URL."""
from typing import Optional

import requests
from requests.exceptions import HTTPError


def fetch_html_from_url(url: str) -> Optional[requests.Response]:
    """
    Fetch raw HTML from a URL.

    :param str url: URL to `GET` contents from.

    :return: the successful `requests.Response`, or None if the request
        failed or returned a non-2xx status.
    """
    try:
        # Spoof a browser User-Agent so naive bot filters accept the request.
        # NOTE(review): the Access-Control-* entries are CORS *response*
        # headers; sending them on a request is almost certainly a no-op.
        # Kept for parity with the original — confirm before removing.
        headers = {
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Allow-Methods": "GET",
            "Access-Control-Allow-Headers": "Content-Type",
            "Access-Control-Max-Age": "3600",
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
        }
        resp = requests.get(url, headers=headers)
        # `requests.get` never raises HTTPError on its own; without this call
        # the HTTPError handler below was unreachable dead code.
        resp.raise_for_status()
        return resp
    except HTTPError as e:
        print(f"HTTP error occurred: {e}")
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
    return None

0 commit comments

Comments
 (0)