-
Notifications
You must be signed in to change notification settings - Fork 170
/
Copy pathpdf_download.py
51 lines (44 loc) · 1.34 KB
/
pdf_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from requests import get
from urllib.parse import urljoin
from os import path, getcwd
import os
from bs4 import BeautifulSoup as soup
from sys import argv
import os,sys
def get_page(base_url):
    """Fetch *base_url* and return the response body as text.

    Args:
        base_url: URL of the page to download.

    Returns:
        The page HTML as a string.

    Raises:
        Exception: if the server responds with any non-200 status code.
    """
    # A timeout prevents the script from hanging forever on a stalled server;
    # the original call had none, which is the default (infinite) in requests.
    resp = get(base_url, timeout=30)
    if resp.status_code == 200:
        return resp.text
    # Keep the original plain-Exception contract so existing callers still match.
    raise Exception('Error {0}'.format(resp.status_code))
def get_all_links(html):
    """Parse *html* and return all anchor (<a>) tags found in it.

    Args:
        html: raw HTML text of a page.

    Returns:
        A list of bs4 Tag objects, one per <a> element (may be empty).
    """
    parsed = soup(html, "html.parser")
    # find_all is the current bs4 name; findAll is a deprecated alias.
    # The debug print of the whole tag list has been removed.
    return parsed.find_all('a')
def get_pdf(base_url, base_dir):
    """Download every PDF linked from *base_url* into *base_dir*.

    Args:
        base_url: URL of the page whose anchor tags are scanned for .pdf links.
        base_dir: directory to save the PDFs into (created if missing).

    Raises:
        Exception: if the page contains no links at all, or no PDF links.
    """
    html = get_page(base_url)
    links = get_all_links(html)
    if not links:
        raise Exception('No links found on the webpage')
    # BUG FIX: the original loop did `print(link); break` as its first two
    # statements, so the download code below was unreachable and at most
    # nothing was ever saved. The debug print/break are removed.
    # BUG FIX: the original `os.chdir('output_pdfs')` ignored the base_dir
    # parameter and crashed if that directory did not exist.
    os.makedirs(base_dir, exist_ok=True)
    n_pdfs = 0
    for link in links:
        # Anchors without an href attribute would raise KeyError with
        # link['href']; .get() skips them safely.
        href = link.get('href', '')
        if href.endswith('.pdf'):
            n_pdfs += 1
            content = get(urljoin(base_url, href), timeout=30)
            # NOTE(review): the response status / content-type is still not
            # checked before writing, matching the original behavior.
            with open(path.join(base_dir, link.text + '.pdf'), 'wb') as pdf:
                pdf.write(content.content)
    if n_pdfs == 0:
        raise Exception('No pdfs found on the page')
    # Restored from the commented-out Python-2 print at the end of the original.
    print("{0} pdfs downloaded and saved in {1}".format(n_pdfs, base_dir))
if __name__ == "__main__":
    # Expect exactly one command-line argument: the URL of the page to scrape.
    if len(sys.argv) != 2:
        print("Invalid number of Arguments")
        raise SystemExit
    url_path = sys.argv[1]
    output_path = os.getcwd()
    get_pdf(url_path, output_path)