diff --git a/.gitignore b/.gitignore
index c0b8c74..c2969b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -136,4 +136,10 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-.vscode/
\ No newline at end of file
+.vscode/### Example user template template
+
+# IntelliJ project files
+.idea
+*.iml
+out
+gen
diff --git a/README.md b/README.md
index 912c3aa..4c6a053 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 A very basic API to scrape product reviews from Amazon and get data in real time as JSON with all fields, that amazon product adverstising api does not provide you.
 
-Full Tutorail - [Amazon Product Reviews API – build you own using Python](https://www.scrapehero.com/free-amazon-product-reviews-api-build-you-own-using-python/)
+Forked from [ScrapeHero/Amazon-Review-Scraper](https://github.com/scrapehero-code/amazon-review-scraper)
 
 ## Usage
 
@@ -11,7 +11,7 @@ Go into the project folder
 1. Install requirements `pip install -r requirements.txt`
 2. Set FLASK_APP - `export FLASK_APP=app.py`
 3. Run App - `flask run`
-4. Call API with Review Page URL. Eg: `http://localhost:5000/?url=https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews`
+4. Call API with Review Page URL. Eg: [`http://localhost:5000/?url=https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews`](http://localhost:5000/?url=https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews)
 
 ## Example Data Format
 
diff --git a/app.py b/app.py
index c4b67e4..ef3e242 100644
--- a/app.py
+++ b/app.py
@@ -1,11 +1,14 @@
-from flask import Flask, request, jsonify
+from flask import Flask, request
 import selectorlib
 import requests
+import json
 from dateutil import parser as dateparser
+
 app = Flask(__name__)
 extractor = selectorlib.Extractor.from_yaml_file('selectors.yml')
 
-def scrape(url):
+
+def scrape(url):
     headers = {
         'authority': 'www.amazon.com',
         'pragma': 'no-cache',
@@ -21,45 +24,74 @@ def scrape(url):
     }
 
     # Download the page using requests
-    print("Downloading %s"%url)
+    print("Downloading %s" % url)
     r = requests.get(url, headers=headers)
     # Simple check to check if page was blocked (Usually 503)
     if r.status_code > 500:
         if "To discuss automated access to Amazon data please contact" in r.text:
-            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
+            raise Exception("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
         else:
-            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
-        return None
-    # Pass the HTML of the page and create
-    data = extractor.extract(r.text,base_url=url)
+            raise Exception("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
+
+    # Pass the HTML of the page and create
+    data = extractor.extract(r.text, base_url=url)
+
+    # check if the extracted data is empty
+    if data['reviews'] is None:
+        raise Exception("ERROR: No data extracted. \nCheck selector config")
+
     reviews = []
     for r in data['reviews']:
-        r["product"] = data["product_title"]
+        r['rating'] = int(float(r['title'].split(' out of')[0]))
+        r['title'] = r['title'].split(' out of 5 stars ')[-1]
+        r['product'] = data['product_title']
         r['url'] = url
-        if 'verified_purchase' in r:
+        if r['found_helpful'] is None:
+            r['found_helpful'] = 0
+        elif 'One person found this helpful' in r['found_helpful']:
+            r['found_helpful'] = 1
+        elif 'people found this helpful' in r['found_helpful']:
+            r['found_helpful'] = int(r['found_helpful'].split()[0])
+        else:
+            r['found_helpful'] = 0
+        if 'verified_purchase' in r and r['verified_purchase'] is not None:
             if 'Verified Purchase' in r['verified_purchase']:
                 r['verified_purchase'] = True
             else:
                 r['verified_purchase'] = False
-        r['rating'] = r['rating'].split(' out of')[0]
         date_posted = r['date'].split('on ')[-1]
         if r['images']:
             r['images'] = "\n".join(r['images'])
         r['date'] = dateparser.parse(date_posted).strftime('%d %b %Y')
         reviews.append(r)
+    data['reviews'] = reviews
     histogram = {}
     for h in data['histogram']:
         histogram[h['key']] = h['value']
     data['histogram'] = histogram
     data['average_rating'] = float(data['average_rating'].split(' out')[0])
-    data['reviews'] = reviews
-    data['number_of_reviews'] = int(data['number_of_reviews'].split(' customer')[0])
-    return data
-
+    data['number_of_reviews'] = int(data['number_of_reviews'].split(' global ratings')[0].replace(',', ''))
+    return data
+
+
+def to_json(data, status=200):
+    return json.dumps(data, indent=2), status, {'Content-Type': 'application/json; charset=utf-8'}
+
+
 @app.route('/')
 def api():
-    url = request.args.get('url',None)
+    url = request.args.get('url', None)
+    if request.args.get('pageNumber', None) is None:
+        url += '&pageNumber=1'
+    elif int(request.args.get('pageNumber', None)) <= 10:
+        url += '&pageNumber=' + request.args.get('pageNumber', None)
+    else:
+        return to_json({'error': 'Page number should be less than or equal to 10'}, 400)
+
     if url:
-        data = scrape(url)
-        return jsonify(data)
-    return jsonify({'error':'URL to scrape is not provided'}),400
\ No newline at end of file
+        try:
+            data = scrape(url)
+            return to_json(data)
+        except Exception as e:
+            return to_json({'error': str(e)}, 400)
+    return to_json({'error': 'URL to scrape is not provided'}, 400)
diff --git a/requirements.txt b/requirements.txt
index e0f806e..be90693 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,5 @@
-click==7.1.1
-cssselect==1.1.0
-Flask==1.1.2
-itsdangerous==1.1.0
-Jinja2==2.11.2
-lxml==4.5.0
-MarkupSafe==1.1.1
-parsel==1.5.2
-python-dateutil==2.8.1
-PyYAML==5.3.1
-selectorlib==0.16.0
-six==1.14.0
-w3lib==1.21.0
-Werkzeug==1.0.1
+flask
+selectorlib
+requests
+json
+dateutil
\ No newline at end of file