|
| 1 | +# PyMongo Tutorial : Insert, Read, Update, Delete in MongoDB |
| 2 | + |
| 3 | +# !/usr/bin/env python3.5 |
| 4 | +# -*- coding: UTF-8 -*- |
| 5 | + |
| 6 | +import pprint |
| 7 | +import dateutil.parser |
| 8 | +import datetime |
| 9 | + |
| 10 | +try: |
| 11 | + from pymongo import MongoClient |
| 12 | +except ImportError: |
| 13 | + raise ImportError('PyMongo is not installed in your machine.') |
| 14 | + |
# Connect to the MongoDB server.
# MongoClient() with no arguments connects on the default host and port:
#     client = MongoClient()
# A single client is enough for the whole script, so only the explicit form
# is actually executed; creating a second client and rebinding the name
# would leave the first one open and unreachable.
client = MongoClient(host='127.0.0.1', port=27017, maxPoolSize=100)

# Select the database (the shell equivalent is `use test`).
database = client['test']

# Select the collection - roughly the NoSQL counterpart of a SQL table.
collection = database['restaurants']
| 26 | + |
# Fetch a single document; the empty filter places no condition on it.
result = collection.find_one({})

# Fetch all documents: an empty filter matches every document in the
# collection.
result = collection.find({})
| 33 | + |
# Sample restaurant document used by the insert example below.
# Each grade entry is built from a (timestamp, grade, score) triple; the
# ISO-8601 timestamps are parsed into datetime objects with dateutil.
sample_document_post = {
    "address": {
        "street": "2 Avenue",
        "zipcode": "10075",
        "building": "1480",
        "coord": [-73.9557413, 40.7720266],
    },
    "borough": "Manhattan",
    "cuisine": "Italian",
    "grades": [
        {"date": dateutil.parser.parse(stamp), "grade": letter, "score": points}
        for stamp, letter, points in (
            ("2014-10-01T00:00:00Z", "A", 11),
            ("2014-01-16T00:00:00Z", "B", 17),
        )
    ],
    "name": "Vella",
    "restaurant_id": "41704620",
}

# To store the sample document, uncomment the two lines below.
# NOTE: Collection.insert() is deprecated in PyMongo 3 and removed in
# PyMongo 4, where insert_one() is the replacement.
# post_id = collection.insert(sample_document_post)
# pprint.pprint(post_id)
| 62 | + |
# Query by a top-level field: every document whose restaurant_id is the
# sample restaurant's ID.
result = collection.find(filter={"restaurant_id": "41704620"})

# Query by a top-level field: same filter, but only one matching document
# is returned instead of a cursor.
result = collection.find_one(filter={"restaurant_id": "41704620"})

# Query by a top-level field: documents whose borough field equals
# "Manhattan".
result = collection.find(filter={"borough": "Manhattan"})
| 73 | + |
# Query by a field in an embedded document.
# Use dot notation to put a condition on a field inside an embedded
# document; the whole dotted field name must be quoted. The filter below is
# an equality condition on the zipcode field of the address sub-document.
# Example document shape:
#
#     {"address": {
#         "street": "2 Avenue",
#         "zipcode": "10075",
#         "building": "1480",
#         "coord": [-73.9557413, 40.7720266]
#     }}
result = collection.find(filter={"address.zipcode": "10075"})

# Query by a field in an array.
# The grades array holds embedded documents as its elements, and dot
# notation (quoted as a whole) reaches into them as well. The filter below
# matches documents whose grades array contains an embedded document with
# grade equal to "B".
# Example document shape:
#
#     {"grades": [
#         {
#             "date": dateutil.parser.parse("2014-10-01T00:00:00Z"),
#             "grade": "A",
#             "score": 11
#         },
#         {
#             "date": dateutil.parser.parse("2014-01-16T00:00:00Z"),
#             "grade": "B",
#             "score": 17
#         }]}
result = collection.find(filter={"grades.grade": "B"})
| 109 | + |
# Greater-than operator ($gt): documents whose grades array contains an
# embedded document with a score greater than 30.
result = collection.find(filter={"grades.score": {"$gt": 30}})

# Less-than operator ($lt): documents whose grades array contains an
# embedded document with a score less than 10.
result = collection.find(filter={"grades.score": {"$lt": 10}})
| 117 | + |
# Range operators ($gte / $lte): count the documents that have at least one
# grade dated within the year 2015.
start_date = datetime.datetime(year=2015, month=1, day=1)
# Upper bound is the last millisecond of 31 December (BSON dates carry
# millisecond precision). A bare datetime(2015, 12, 31) is midnight at the
# start of that day and would leave almost all of 31 December out of range.
end_date = datetime.datetime(
    year=2015, month=12, day=31,
    hour=23, minute=59, second=59, microsecond=999000,
)
# $elemMatch forces both bounds onto the SAME grades entry. With a plain
# "grades.date" range, one entry may satisfy $gte and a different entry
# $lte, so a restaurant graded only in 2014 and 2016 would also match.
# count_documents() is used because Cursor.count() was removed in PyMongo 4.
result = collection.count_documents({
    "grades": {"$elemMatch": {"date": {"$gte": start_date, "$lte": end_date}}},
})
| 123 | + |
# Combine conditions - logical AND.
# Listing several conditions in one filter document, separated by commas,
# requires all of them to hold.
result = collection.find({"cuisine": "Italian", "address.zipcode": "10075"})

# Combine conditions - logical OR.
# $or takes a list of alternative filter documents and matches a document
# that satisfies at least one of them. Each alternative must be its own
# list entry: putting both conditions into a single entry would AND them
# and make the query identical to the one above.
result = collection.find({
    "$or": [
        {"cuisine": "Italian"},
        {"address.zipcode": "10075"},
    ],
})
| 136 | + |
# Sort query results.
# Append sort() to the query to order the result set. The call below returns
# all documents in the restaurants collection, sorted first by borough in
# ascending order and then, within each borough, by "address.zipcode" in
# ascending order.
#
# A multi-key sort must be passed as ONE list of (field, direction) pairs,
# where 1 means ascending (pymongo.ASCENDING) and -1 descending. Chaining
# .sort("borough").sort("address.zipcode") does not work: only the last
# sort() applied to a cursor has any effect.
result = collection.find().sort([("borough", 1), ("address.zipcode", 1)])
| 146 | + |
# Update top-level fields.
# Updates the first document whose name equals "Juni": $set overwrites the
# cuisine field and $currentDate stores the current date in lastModified.
#
# update_one() changes at most one matching document; use update_many() to
# change every match. It replaces the legacy Collection.update(...,
# multi=False), which was removed in PyMongo 4.
# upsert (boolean): True - create a new document when no document matches.
result = collection.update_one(
    {"name": "Juni"},
    {
        "$set": {"cuisine": "American (New) Vijay Anand"},
        "$currentDate": {"lastModified": True},
    },
    upsert=False,
)
| 161 | + |
| 162 | + |
# Remove/delete ALL documents that match a condition.
# delete_many() replaces the legacy Collection.remove(), which was removed
# in PyMongo 4.
result = collection.delete_many({"cuisine": "American (New) Vijay Anand"})

# Remove/delete ONE document that matches a condition.
# delete_one() removes at most one matching document. The legacy remove()
# took `multi` as its second positional argument, so passing an options
# dict such as {'$justOne': True} there was truthy and deleted every match.
result = collection.delete_one({"borough": "Queens"})
| 168 | + |
# Total number of documents in the collection.
# count_documents() requires a filter; the empty filter matches everything.
# It replaces find().count(), because Cursor.count() was removed in
# PyMongo 4.
count = collection.count_documents({})
pprint.pprint('Total documents - {}'.format(count))

# Number of documents that match a specific filter.
count = collection.count_documents({"restaurant_id": "41704620"})
pprint.pprint('Total documents found with {} - {}'.format({"restaurant_id": "41704620"}, count))
| 176 | + |
# Group documents by a field and calculate a count.
# The $group stage groups by the key given in its _id field. Fields are
# referenced by field path, i.e. the field name prefixed with a dollar sign,
# and accumulators compute a value per group. This pipeline groups the
# restaurants by borough and uses the $sum accumulator to count the
# documents in each group.
result = collection.aggregate(pipeline=[
    {
        "$group": {
            "_id": "$borough",
            "count": {"$sum": 1},
        },
    },
])


# Filter and group documents.
# $match keeps only the Brooklyn restaurants; $group then counts them per
# zipcode, so each result's _id field holds a distinct zipcode value (the
# group-by key).
result = collection.aggregate(pipeline=[
    {
        "$match": {"borough": "Brooklyn"},
    },
    {
        "$group": {
            "_id": "$address.zipcode",
            "count": {"$sum": 1},
        },
    },
])
| 194 | + |
| 195 | + |
# $in operator: match documents whose borough is any one of the listed
# values.
borough = [
    'Missing',
    'Manhattan',
]
result = collection.find(filter={"borough": {"$in": borough}})
| 199 | + |
| 200 | + |
# Overview of the server: map every database name to the list of its
# collection names.
# list_database_names() / list_collection_names() replace database_names()
# / collection_names(), which were removed in PyMongo 4. The loop variable
# is named db_name so it cannot be confused with the module-level
# `collection` handle.
details = {
    db_name: client[db_name].list_collection_names()
    for db_name in client.list_database_names()
}
pprint.pprint(details)
| 205 | + |
| 206 | + |
| 207 | +import pandas as pd |
| 208 | + |
| 209 | + |
def posts_2_df(iterator, chunk_size=1000):
    """
    Build a single pandas.DataFrame from an iterator of records.

    Records are buffered and converted to small DataFrames of at most
    ``chunk_size`` rows each, which are concatenated at the end. This is a
    balance between memory and efficiency.

    :param iterator: iterable of records accepted by ``pandas.DataFrame``
        (for example the dict documents yielded by a query cursor).
    :param chunk_size: number of records per intermediate DataFrame. A value
        below 1 disables chunking and builds one DataFrame from all records.
    :return: the concatenated DataFrame, or an empty DataFrame when the
        iterator yields nothing. Row labels restart at 0 in every chunk, so
        they are not unique across chunks; call ``reset_index(drop=True)``
        on the result if a unique index is needed.
    """
    frames = []
    records = []
    for record in iterator:
        records.append(record)
        # Flush a full buffer into its own DataFrame. The equality test is
        # never true for chunk_size < 1, so such values simply skip chunking
        # instead of failing.
        if len(records) == chunk_size:
            frames.append(pd.DataFrame(records))
            records = []
    # Keep the final, partially filled chunk.
    if records:
        frames.append(pd.DataFrame(records))
    # pd.concat() raises ValueError on an empty list, so an iterator without
    # records is mapped to an empty DataFrame explicitly.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
| 225 | + |
# Load every restaurant that has at least one grade dated between
# start_date and end_date into a DataFrame and preview the first rows.
# $elemMatch makes both bounds apply to the same grades entry; with a plain
# "grades.date" range, different entries could satisfy each bound.
result = collection.find({
    "grades": {"$elemMatch": {"date": {"$gte": start_date, "$lte": end_date}}},
})
data_frame = posts_2_df(iterator=result, chunk_size=10000)
print(data_frame.head())
0 commit comments