 import re
 from urlparse import urlparse
 
-from web import web
-
-def parameterControl(URL):
-    for site in links:
-        if URL.split("=")[0] in site:
-            return False
-
-    return True
-
-def crawl(url):
-    """crawl the links of the same given domain"""
-    global links
-
-    links = []
-
-    try:
-        result, URL = web.gethtml(url, lastURL=True)
-    except:
-        return None
-
-    if result:
-        # get only domain name
-        domain = 'http://' + '/'.join(URL.split('/')[2:-1]) + '/' if len(URL.split('/')) >= 4 else URL.rstrip('/') + '/'
-
-        for link in re.findall('<a href="(.*?)"', result):
-            # www.example.com/index.(php|aspx|jsp)?query=1
-            if re.search('(.*?)(.php\?|.asp\?|.apsx\?|.jsp\?)(.*?)=(.*?)', link):
-                if parameterControl(link) == True:
-                    if link.startswith(("http", "www")) or domain in urlparse(link).path:
-                        links.append(link)
-                    else:
-                        links.append(domain + link if link.startswith("/") else domain + link)
-
-    return links
+# third-party crawler library: nyawc (Not Your Average Web Crawler)
+from nyawc.Options import Options
+from nyawc.QueueItem import QueueItem
+from nyawc.Crawler import Crawler as nyawcCrawler
+from nyawc.CrawlerActions import CrawlerActions
+from nyawc.http.Request import Request
+
+
+class Crawler:
+    def __init__(self):
+        self.links = []
+        self.crawler = None
+        self.setoptions()
+
+    def crawl(self, url):
+        if self.crawler is None:
+            print("Crawler is not set up")
+            return []
+
+        # crawl from the root of the given URL's domain
+        parsedurl = urlparse(url)
+        domain = parsedurl.scheme + "://" + parsedurl.netloc
+
+        self.links = []
+        # start_with blocks until crawling finishes; requestfinish below
+        # fills self.links along the way
+        self.crawler.start_with(Request(domain))
+        return self.links
+
+    def setoptions(self, depth=1):
+        """Configure the crawler; depth sets how many links deep to follow."""
+
+        options = Options()
+        options.scope.max_depth = depth
+        options.callbacks.crawler_before_start = self.crawlerstart
+        options.callbacks.crawler_after_finish = self.crawlerfinish
+        options.callbacks.request_before_start = self.requeststart
+        options.callbacks.request_after_finish = self.requestfinish
+
+        self.crawler = nyawcCrawler(options)
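+        # Only the crawl depth is set above; nyawc's Options object also
+        # exposes further settings (scope, performance, identity) that are
+        # left at their defaults here.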
+
+    def crawlerstart(self):
+        # Called before the crawler starts crawling.
+        pass
+
+    def crawlerfinish(self, queue):
+        # Called after the crawler has finished crawling.
+        pass
+
+    def requeststart(self, queue, queue_item):
+        # Called before the crawler starts a new request.
+        return CrawlerActions.DO_CONTINUE_CRAWLING
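+        # (Returning CrawlerActions.DO_SKIP_TO_NEXT would skip this request;
+        # CrawlerActions.DO_STOP_CRAWLING would abort the crawl.)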
+
+    def requestfinish(self, queue, queue_item, new_queue_items):
+        # Called after the crawler finishes a request; keep URLs that look
+        # like dynamic pages carrying a query parameter.
+        url = queue_item.request.url
+        if re.search(r'\.(php|asp|aspx|jsp)\?.*=', url):
+            if url not in self.links:
+                self.links.append(url)
+        return CrawlerActions.DO_CONTINUE_CRAWLING
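+
+
+# Minimal usage sketch (illustrative only, not part of the module): crawl
+# http://example.com/ one level deep and print the dynamic URLs collected.
+# Assumes nyawc is installed and the target host is reachable.
+if __name__ == "__main__":
+    c = Crawler()
+    for found in c.crawl("http://example.com/"):
+        print(found)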