 import re
 from urlparse import urlparse
 
-from web import web
-
-def parameterControl(URL):
-    for site in links:
-        if URL.split("=")[0] in site:
-            return False
-
-    return True
-
-def crawl(url):
-    """crawl the links of the same given domain"""
-    global links
-
-    links = []
-
-    try:
-        result, URL = web.gethtml(url, lastURL=True)
-    except:
-        return None
-
-    if result:
-        # get only domain name
-        domain = 'http://' + '/'.join(URL.split('/')[2:-1]) + '/' if len(URL.split('/')) >= 4 else URL.rstrip('/') + '/'
-
-        for link in re.findall('<a href="(.*?)"', result):
-            # www.example.com/index.(php|aspx|jsp)?query=1
-            if re.search('(.*?)(.php\?|.asp\?|.apsx\?|.jsp\?)(.*?)=(.*?)', link):
-                if parameterControl(link) == True:
-                    if link.startswith(("http", "www")) or domain in urlparse(link).path:
-                        links.append(link)
-                    else:
-                        links.append(domain + link if link.startswith("/") else domain + link)
-
-    return links
+# third-party crawler library: nyawc (Not Your Average Web Crawler)
+from nyawc.Options import Options
+from nyawc.QueueItem import QueueItem
+from nyawc.Crawler import Crawler as nyawcCrawler
+from nyawc.CrawlerActions import CrawlerActions
+from nyawc.http.Request import Request
+
+
+class Crawler:
+    def __init__(self):
+        self.links = []
+        self.crawler = None
+        self.setoptions()
+
+    def crawl(self, url):
+        if self.crawler is None:
+            print("Crawler is not set up")
+            return []
+
+        # crawl from the root of the given URL's domain
+        parsedurl = urlparse(url)
+        domain = parsedurl.scheme + "://" + parsedurl.netloc
+
+        self.links = []
+        # start_with blocks until crawling finishes; requestfinish below
+        # fills self.links along the way
+        self.crawler.start_with(Request(domain))
+        return self.links
+
+    def setoptions(self, depth=1):
+        """Configure the crawler; depth sets how many links deep to follow."""
+
+        options = Options()
+        options.scope.max_depth = depth
+        options.callbacks.crawler_before_start = self.crawlerstart
+        options.callbacks.crawler_after_finish = self.crawlerfinish
+        options.callbacks.request_before_start = self.requeststart
+        options.callbacks.request_after_finish = self.requestfinish
+
+        self.crawler = nyawcCrawler(options)
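+        # Only the crawl depth is set above; nyawc's Options object also
+        # exposes further settings (scope, performance, identity) that are
+        # left at their defaults here.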
+
+    def crawlerstart(self):
+        # Called before the crawler starts crawling.
+        pass
+
+    def crawlerfinish(self, queue):
+        # Called after the crawler has finished crawling.
+        pass
+
+    def requeststart(self, queue, queue_item):
+        # Called before the crawler starts a new request.
+        return CrawlerActions.DO_CONTINUE_CRAWLING
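+        # (Returning CrawlerActions.DO_SKIP_TO_NEXT would skip this request;
+        # CrawlerActions.DO_STOP_CRAWLING would abort the crawl.)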
+
+    def requestfinish(self, queue, queue_item, new_queue_items):
+        # Called after the crawler finishes a request; keep URLs that look
+        # like dynamic pages carrying a query parameter.
+        url = queue_item.request.url
+        if re.search(r'\.(php|asp|aspx|jsp)\?.*=', url):
+            if url not in self.links:
+                self.links.append(url)
+        return CrawlerActions.DO_CONTINUE_CRAWLING
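+
+
+# Minimal usage sketch (illustrative only, not part of the module): crawl
+# http://example.com/ one level deep and print the dynamic URLs collected.
+# Assumes nyawc is installed and the target host is reachable.
+if __name__ == "__main__":
+    c = Crawler()
+    for found in c.crawl("http://example.com/"):
+        print(found)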