This repository was archived by the owner on Oct 30, 2018. It is now read-only.

Commit 264937b

crawler function replaced
1 parent 0347653 commit 264937b

4 files changed: +64 -36 lines changed


README.md (+1)

@@ -23,6 +23,7 @@ SQLiv
 > - [bs4](https://pypi.python.org/pypi/bs4)
 > - [termcolor](https://pypi.python.org/pypi/termcolor)
 > - [google](https://pypi.python.org/pypi/google)
+> - [nyawc](https://pypi.python.org/pypi/nyawc/)
 
 **Pre-installed Systems**
 - [BlackArch Linux](https://blackarch.org/scanner.html) ![BlackArch](https://raw.githubusercontent.com/BlackArch/blackarch-artwork/master/logo/logo-38-49.png)

requirements.txt (+1)

@@ -1,3 +1,4 @@
 bs4
 termcolor
 terminaltables
+nyawc

sqliv.py (+5 -2)

@@ -8,17 +8,20 @@
 
 from src import std
 from src import scanner
-from src import crawler
 from src import reverseip
 from src import serverinfo
 from src.web import search
+from src.crawler import Crawler
 
 
 # search engine instance
 bing = search.Bing()
 google = search.Google()
 yahoo = search.Yahoo()
 
+# crawler instance
+crawler = Crawler()
+
 
 def singlescan(url):
     """instance to scan single targeted domain"""
@@ -39,7 +42,7 @@ def singlescan(url):
 
     # crawl and scan the links
    # if crawl cannot find links, do some reverse domain
-    std.stdout("crawling {}".format(url))
+    std.stdout("going to crawl {}".format(url))
     urls = crawler.crawl(url)
 
     if not urls:
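For reference, the call-site contract that `singlescan` relies on is unchanged by this commit: `crawler.crawl(url)` still returns something falsy when nothing useful was found. The old module-level `crawl()` returned `None` when the page could not be fetched; the new `Crawler.crawl()` returns an empty list when no parameterised links were collected (or `None` if the nyawc crawler was never set up), so the existing `if not urls:` guard covers every case. A minimal sketch of that contract, assuming the repository root is on the import path and using an arbitrary example target URL:

from src.crawler import Crawler

crawler = Crawler()                                   # sets up nyawc with the default depth of 1
urls = crawler.crawl("http://testphp.vulnweb.com/")   # arbitrary example target

if not urls:
    # covers both None (crawler not set up) and [] (no parameterised links found);
    # sqliv falls back to the reverse-domain path mentioned in the comment above
    print "no crawlable links found"
else:
    for url in urls:
        print url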

src/crawler.py (+57 -34)

@@ -1,37 +1,60 @@
 import re
 from urlparse import urlparse
 
-from web import web
-
-def parameterControl(URL):
-    for site in links:
-        if URL.split("=")[0] in site:
-            return False
-
-    return True
-
-def crawl(url):
-    """crawl the links of the same given domain"""
-    global links
-
-    links = []
-
-    try:
-        result, URL = web.gethtml(url, lastURL=True)
-    except:
-        return None
-
-    if result:
-        # get only domain name
-        domain = 'http://' + '/'.join(URL.split('/')[2:-1]) + '/' if len(URL.split('/')) >= 4 else URL.rstrip('/') + '/'
-
-        for link in re.findall('<a href="(.*?)"', result):
-            # www.example.com/index.(php|aspx|jsp)?query=1
-            if re.search('(.*?)(.php\?|.asp\?|.apsx\?|.jsp\?)(.*?)=(.*?)', link):
-                if parameterControl(link) == True:
-                    if link.startswith(("http", "www")) or domain in urlparse(link).path:
-                        links.append(link)
-                    else:
-                        links.append(domain + link if link.startswith("/") else domain + link)
-
-    return links
+#import std
+from nyawc.Options import Options
+from nyawc.QueueItem import QueueItem
+from nyawc.Crawler import Crawler as nyawcCrawler
+from nyawc.CrawlerActions import CrawlerActions
+from nyawc.http.Request import Request
+
+
+class Crawler:
+    def __init__(self):
+        self.links = []
+        self.crawler = None
+        self.setoptions()
+
+    def crawl(self, url):
+        if self.crawler is None:
+            print "Crawler is not set up"
+            return
+
+        parsedurl = urlparse(url)
+        domain = parsedurl.scheme + "://" + parsedurl.netloc
+
+        self.links = []
+        self.crawler.start_with(Request(domain))
+        return self.links
+
+    def setoptions(self, depth=1):
+        """Define how far the user wants to crawl"""
+
+        options = Options()
+        options.scope.max_depth = depth
+        options.callbacks.crawler_before_start = self.crawlerstart
+        options.callbacks.crawler_after_finish = self.crawlerfinish
+        options.callbacks.request_before_start = self.requeststart
+        options.callbacks.request_after_finish = self.requestfinish
+
+        self.crawler = nyawcCrawler(options)
+
+    def crawlerstart(self):
+        # Called before the crawler starts crawling. Default is a null route.
+        pass
+
+    def crawlerfinish(self, queue):
+        # Called after the crawler finished crawling. Default is a null route.
+        pass
+
+    def requeststart(self, queue, queue_item):
+        # Called before the crawler starts a new request. Default is a null route.
+        return CrawlerActions.DO_CONTINUE_CRAWLING
+
+    def requestfinish(self, queue, queue_item, new_queue_items):
+        # Called after the crawler finishes a request. Default is a null route.
+        url = queue_item.request.url
+        if re.search('(.*?)(.php\?|.asp\?|.aspx\?|.jsp\?)(.*?)=(.*?)', url):
+            if not url in self.links:
+                self.links.append(url)
+        return CrawlerActions.DO_CONTINUE_CRAWLING
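The new class can also be driven on its own. In the committed code, `__init__` calls `setoptions()` so the nyawc crawler starts with `scope.max_depth = 1`, `setoptions(depth=...)` rebuilds it with a different depth, and `crawl(url)` starts nyawc from the scheme and host of the given URL and returns whatever the `requestfinish` callback collected. A minimal usage sketch, assuming nyawc is installed from PyPI and using an arbitrary example target:

from src.crawler import Crawler

crawler = Crawler()          # __init__ calls setoptions(), so depth defaults to 1
crawler.setoptions(depth=2)  # optional: rebuild the nyawc crawler with a deeper scope

# the committed code assumes start_with() returns only after the crawl finishes,
# at which point crawl() hands back the collected links
links = crawler.crawl("http://testphp.vulnweb.com/")  # arbitrary example target
for link in links:
    print link               # only .php?/.asp?/.aspx?/.jsp? URLs with a query parameter are kept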
