Fixed script. #10

Open: wants to merge 6 commits into `master`.
8 changes: 7 additions & 1 deletion .gitignore
@@ -136,4 +136,10 @@ dmypy.json

# Cython debug symbols
cython_debug/
.vscode/

### Example user template template

# IntelliJ project files
.idea
*.iml
out
gen
4 changes: 2 additions & 2 deletions README.md
@@ -2,7 +2,7 @@

A very basic API to scrape product reviews from Amazon and get real-time JSON data with all the fields that the Amazon Product Advertising API does not provide.

Full Tutorail - [Amazon Product Reviews API – build you own using Python](https://www.scrapehero.com/free-amazon-product-reviews-api-build-you-own-using-python/)
Forked from [ScrapeHero/Amazon-Review-Scraper](https://github.com/scrapehero-code/amazon-review-scraper)

## Usage

@@ -11,7 +11,7 @@ Go into the project folder
1. Install requirements `pip install -r requirements.txt`
2. Set FLASK_APP - `export FLASK_APP=app.py`
3. Run App - `flask run`
4. Call API with Review Page URL. Eg: `http://localhost:5000/?url=https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews`
4. Call API with Review Page URL. Eg: [`http://localhost:5000/?url=https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews`](http://localhost:5000/?url=https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews)
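The same call can be made from code rather than a browser. A minimal sketch in Python, using the example URL from step 4 (the actual `requests.get` call is left as a comment so the snippet runs without a live server):

```python
# Build the API request URL for the local Flask app
# (assumes the app is running on localhost:5000).
from urllib.parse import urlencode

review_page = (
    "https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/"
    "product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm"
)
# urlencode percent-encodes the review URL so it survives as a query param
api_url = "http://localhost:5000/?" + urlencode(
    {"url": review_page, "pageNumber": 1}
)
print(api_url)
# With the server running: requests.get(api_url).json()
```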

## Example Data Format

70 changes: 51 additions & 19 deletions app.py
@@ -1,11 +1,14 @@
from flask import Flask, request, jsonify
from flask import Flask, request
import selectorlib
import requests
import json
from dateutil import parser as dateparser

app = Flask(__name__)
extractor = selectorlib.Extractor.from_yaml_file('selectors.yml')

def scrape(url):

def scrape(url):
headers = {
'authority': 'www.amazon.com',
'pragma': 'no-cache',
@@ -21,45 +24,74 @@ def scrape(url):
}

# Download the page using requests
print("Downloading %s"%url)
print("Downloading %s" % url)
r = requests.get(url, headers=headers)
# Simple check to see if the page was blocked (usually 503)
if r.status_code > 500:
if "To discuss automated access to Amazon data please contact" in r.text:
print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
raise Exception("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
else:
print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
return None
# Pass the HTML of the page and create
data = extractor.extract(r.text,base_url=url)
raise Exception("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))

# Pass the page HTML to the extractor and build the data dict
data = extractor.extract(r.text, base_url=url)

# check if the extracted data is empty
if data['reviews'] is None:
raise Exception("ERROR: No data extracted. Check selector config")

reviews = []
for r in data['reviews']:
r["product"] = data["product_title"]
r['rating'] = int(float(r['title'].split(' out of')[0]))
r['title'] = r['title'].split(' out of 5 stars ')[-1]
r['product'] = data['product_title']
r['url'] = url
if 'verified_purchase' in r:
if r['found_helpful'] is None:
r['found_helpful'] = 0
elif 'One person found this helpful' in r['found_helpful']:
r['found_helpful'] = 1
elif 'people found this helpful' in r['found_helpful']:
r['found_helpful'] = int(r['found_helpful'].split()[0])
else:
r['found_helpful'] = 0
if 'verified_purchase' in r and r['verified_purchase'] is not None:
if 'Verified Purchase' in r['verified_purchase']:
r['verified_purchase'] = True
else:
r['verified_purchase'] = False
r['rating'] = r['rating'].split(' out of')[0]
date_posted = r['date'].split('on ')[-1]
if r['images']:
r['images'] = "\n".join(r['images'])
r['date'] = dateparser.parse(date_posted).strftime('%d %b %Y')
reviews.append(r)
data['reviews'] = reviews
histogram = {}
for h in data['histogram']:
histogram[h['key']] = h['value']
data['histogram'] = histogram
data['average_rating'] = float(data['average_rating'].split(' out')[0])
data['reviews'] = reviews
data['number_of_reviews'] = int(data['number_of_reviews'].split(' customer')[0])
return data

data['number_of_reviews'] = int(data['number_of_reviews'].split(' global ratings')[0].replace(',', ''))
return data
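The per-review normalisation done in the loop above can be sketched as pure helpers, which makes the parsing rules easy to unit-test in isolation (the helper names are illustrative, not part of the script):

```python
def parse_rating(title):
    # "4.0 out of 5 stars Comfortable shoe" -> 4
    return int(float(title.split(' out of')[0]))


def strip_rating_prefix(title):
    # "4.0 out of 5 stars Comfortable shoe" -> "Comfortable shoe"
    return title.split(' out of 5 stars ')[-1]


def parse_found_helpful(text):
    # None -> 0; "One person found this helpful" -> 1;
    # "12 people found this helpful" -> 12
    if text is None:
        return 0
    if 'One person found this helpful' in text:
        return 1
    if 'people found this helpful' in text:
        return int(text.split()[0])
    return 0
```

Amazon renders the rating as the prefix of the review title, so the rating and the clean title both come from splitting the same string.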


def to_json(data, status=200):
return json.dumps(data, indent=2), status, {'Content-Type': 'application/json; charset=utf-8'}
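The `to_json` helper returns a `(body, status, headers)` tuple, which Flask interprets as a complete response. A self-contained copy of it, runnable without Flask, shows the shape of what the route handler returns:

```python
import json

# Same helper as in the diff above: Flask treats a
# (body, status, headers) tuple as a full HTTP response.
def to_json(data, status=200):
    return json.dumps(data, indent=2), status, {'Content-Type': 'application/json; charset=utf-8'}

body, status, headers = to_json({'error': 'example'}, 400)
```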


@app.route('/')
def api():
url = request.args.get('url',None)
url = request.args.get('url', None)
if not url:
return to_json({'error': 'URL to scrape is not provided'}, 400)
if request.args.get('pageNumber', None) is None:
url += '&pageNumber=1'
elif int(request.args.get('pageNumber', None)) <= 10:
url += '&pageNumber=' + request.args.get('pageNumber', None)
else:
return to_json({'error': 'Page number should be less than or equal to 10'}, 400)

if url:
data = scrape(url)
return jsonify(data)
return jsonify({'error':'URL to scrape is not provided'}),400
try:
data = scrape(url)
return to_json(data)
except Exception as e:
return to_json({'error': str(e)}, 400)
return to_json({'error': 'URL to scrape is not provided'}, 400)
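The `pageNumber` handling added in `api()` can be summarised as a pure function (`resolve_url` is an illustrative name, not part of the PR): default to page 1, accept pages up to 10, reject anything higher.

```python
# Sketch of the pageNumber rules from api() as a pure helper.
# Returns (url_to_scrape, error_message); exactly one is None.
def resolve_url(url, page_number=None):
    if page_number is None:
        return url + '&pageNumber=1', None
    if int(page_number) <= 10:
        return url + '&pageNumber=' + str(page_number), None
    return None, 'Page number should be less than or equal to 10'
```

The 10-page cap matches Amazon's own review pagination limit for anonymous visitors, which is presumably why the PR rejects higher values.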
19 changes: 5 additions & 14 deletions requirements.txt
@@ -1,14 +1,5 @@
click==7.1.1
cssselect==1.1.0
Flask==1.1.2
itsdangerous==1.1.0
Jinja2==2.11.2
lxml==4.5.0
MarkupSafe==1.1.1
parsel==1.5.2
python-dateutil==2.8.1
PyYAML==5.3.1
selectorlib==0.16.0
six==1.14.0
w3lib==1.21.0
Werkzeug==1.0.1
flask
selectorlib
requests
python-dateutil