Lecture 18: Mongo DB II¶
In [1]:
import json
import pymongo
import pprint
In [2]:
# instead of saving a full copy of existing data,
# just symlink to previous lecture. a bit hacky, but saves space!
!ln -sf ../lec17-mongo_i/data data
In [2]:
# this is a utility function we define so that find_all() prints nicely
def pretty_print(output_collection, n_to_print=3):
"""
note if n_to_print is -1, this print everything
"""
for i, doc in enumerate(output_collection):
pprint.pprint(doc) # nicely formats each document
if i+1 == n_to_print: return
In [ ]:
Aggregation Queries¶
Zips JSON from the MongoDB Aggregation Tutorial: https://www.mongodb.com/docs/manual/tutorial/aggregation-zip-code-data-set/
Load zips.json
into new local.zips
¶
For the sake of simplicity, we'll make a new collection zips
in a new aggquerydb
database.
In [3]:
# reimport/redefine as needed
import json
import pymongo
import pprint
def pretty_print(output_collection, n_to_print=3):
"""
note if n_to_print is -1, this print everything
"""
for i, doc in enumerate(output_collection):
pprint.pprint(doc) # nicely formats each document
if i+1 == n_to_print: return
In [5]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()
Out[5]:
['admin', 'config', 'local', 'nobel_prizes']
In [6]:
# run this cell to make the new collection
# and insert zipcode documents
client.drop_database('aggquerydb') # if already exists
db = client.aggquerydb
with open('data/zips.json', encoding='utf-8') as f:
for line in f.readlines():
db.zips.insert_one(json.loads(line))
In [7]:
db.zips.count_documents({})
Out[7]:
29353
$group
¶
In [7]:
# Just the grouping, no filtering
output = db.zips.aggregate( [
{ "$group": { "_id": "$state",
"totalPop":
{ "$sum": "$pop" } } },
] )
pretty_print(output, n_to_print=5)
{'_id': 'ND', 'totalPop': 638272} {'_id': 'AR', 'totalPop': 2350725} {'_id': 'HI', 'totalPop': 1108229} {'_id': 'WV', 'totalPop': 1793146} {'_id': 'TX', 'totalPop': 16984601}
In [9]:
output = db.zips.aggregate( [
{ "$group": { "_id": "$state",
"totalPop":
{ "$sum": "$pop" } } },
{ "$match": { "totalPop":
{ "$gte": 15000000 } } },
{ "$sort" : { "totalPop" : -1 } }
] )
pretty_print(output, n_to_print=10)
{'_id': 'CA', 'totalPop': 29754890} {'_id': 'NY', 'totalPop': 17990402} {'_id': 'TX', 'totalPop': 16984601}
Aggregation Queries: Unwind/Lookup¶
Make a new collection, inventory
.
In [10]:
db = client.aggquerydb # stay in same database
db.drop_collection('inventory') # recreate as needed
db.inventory.insert_many( [
{ "item": "journal",
"tags": ["blank", "red"],
"dim": [ 14, 21 ],
"instock": [ { "loc": "A", "qty": 5 }, { "loc": "C", "qty": 15 } ]
},
{ "item": "notebook",
"tags": ["red", "blank"],
"dim": [ 14, 21 ],
"instock": [ { "loc": "C", "qty": 5 } ]
},
{ "item": "paper",
"tags": ["red", "blank", "plain"],
"dim": [ 14, 21 ],
"instock": [ { "loc": "A", "qty": 60 }, { "loc": "B", "qty": 15 } ]
},
{ "item": "planner",
"tags": ["blank", "red"],
"dim": [ 22.85, 30 ] ,
"instock": [ { "loc": "A", "qty": 40 }, { "loc": "B", "qty": 5 } ]
},
{ "item": "postcard",
"tags": ["blue"],
"dim": [ 10, 15.25 ],
"instock": [ { "loc": "E", "qty": 15 }, { "loc": "D", "qty": 35 } ]
}
]);
In [ ]:
In [11]:
pretty_print(db.inventory.find({}))
{'_id': ObjectId('6723bcb2c96a1d9cd406ab20'), 'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}], 'item': 'journal', 'tags': ['blank', 'red']} {'_id': ObjectId('6723bcb2c96a1d9cd406ab21'), 'dim': [14, 21], 'instock': [{'loc': 'C', 'qty': 5}], 'item': 'notebook', 'tags': ['red', 'blank']} {'_id': ObjectId('6723bcb2c96a1d9cd406ab22'), 'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 60}, {'loc': 'B', 'qty': 15}], 'item': 'paper', 'tags': ['red', 'blank', 'plain']}
Unwind example¶
In [12]:
# Notice, tags is no longer an array
output = db.inventory.aggregate( [
{ "$unwind" : "$tags" }
] )
pretty_print(output, n_to_print=4)
{'_id': ObjectId('6723bcb2c96a1d9cd406ab20'), 'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}], 'item': 'journal', 'tags': 'blank'} {'_id': ObjectId('6723bcb2c96a1d9cd406ab20'), 'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}], 'item': 'journal', 'tags': 'red'} {'_id': ObjectId('6723bcb2c96a1d9cd406ab21'), 'dim': [14, 21], 'instock': [{'loc': 'C', 'qty': 5}], 'item': 'notebook', 'tags': 'red'} {'_id': ObjectId('6723bcb2c96a1d9cd406ab21'), 'dim': [14, 21], 'instock': [{'loc': 'C', 'qty': 5}], 'item': 'notebook', 'tags': 'blank'}
In [13]:
output = db.inventory.aggregate( [
{ "$unwind" : "$tags" },
{ "$project" : {"_id" : 0, "instock": 0}}
] )
pretty_print(output, n_to_print=-1)
{'dim': [14, 21], 'item': 'journal', 'tags': 'blank'} {'dim': [14, 21], 'item': 'journal', 'tags': 'red'} {'dim': [14, 21], 'item': 'notebook', 'tags': 'red'} {'dim': [14, 21], 'item': 'notebook', 'tags': 'blank'} {'dim': [14, 21], 'item': 'paper', 'tags': 'red'} {'dim': [14, 21], 'item': 'paper', 'tags': 'blank'} {'dim': [14, 21], 'item': 'paper', 'tags': 'plain'} {'dim': [22.85, 30], 'item': 'planner', 'tags': 'blank'} {'dim': [22.85, 30], 'item': 'planner', 'tags': 'red'} {'dim': [10, 15.25], 'item': 'postcard', 'tags': 'blue'}
In [14]:
output = db.inventory.aggregate( [
{ "$unwind" : "$instock" },
{ "$group" : { "_id" : "$item", "totalqty" : {"$sum" : "$instock.qty"}}}
] )
pretty_print(output, n_to_print=-1)
{'_id': 'planner', 'totalqty': 45} {'_id': 'journal', 'totalqty': 20} {'_id': 'postcard', 'totalqty': 50} {'_id': 'paper', 'totalqty': 75} {'_id': 'notebook', 'totalqty': 5}
In [16]:
output = db.inventory.aggregate( [
{ "$lookup" : {
"from" : "inventory",
"localField": "instock.loc",
"foreignField": "instock.loc",
"as":"otheritems"}
},
{ "$project" :
{"_id" : 0, "tags" : 0, "dim" : 0, "otheritems._id": 0}
}
] )
pretty_print(output, n_to_print=2)
{'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}], 'item': 'journal', 'otheritems': [{'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}], 'item': 'journal', 'tags': ['blank', 'red']}, {'dim': [14, 21], 'instock': [{'loc': 'C', 'qty': 5}], 'item': 'notebook', 'tags': ['red', 'blank']}, {'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 60}, {'loc': 'B', 'qty': 15}], 'item': 'paper', 'tags': ['red', 'blank', 'plain']}, {'dim': [22.85, 30], 'instock': [{'loc': 'A', 'qty': 40}, {'loc': 'B', 'qty': 5}], 'item': 'planner', 'tags': ['blank', 'red']}]} {'instock': [{'loc': 'C', 'qty': 5}], 'item': 'notebook', 'otheritems': [{'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}], 'item': 'journal', 'tags': ['blank', 'red']}, {'dim': [14, 21], 'instock': [{'loc': 'C', 'qty': 5}], 'item': 'notebook', 'tags': ['red', 'blank']}]}
Multiple Attribute Grouping¶
In [17]:
# reimport/redefine as needed
import json
import pymongo
import pprint
def pretty_print(output_collection, n_to_print=3):
"""
note if n_to_print is -1, this print everything
"""
for i, doc in enumerate(output_collection):
pprint.pprint(doc) # nicely formats each document
if i+1 == n_to_print: return
In [19]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()
Out[19]:
['admin', 'aggquerydb', 'config', 'local', 'nobel_prizes']
In [20]:
db = client.aggquerydb
In [24]:
db.zips.count_documents({})
Out[24]:
29353
In [18]:
db.zips.find_one()
Out[18]:
{'_id': '01001', 'city': 'AGAWAM', 'loc': [-72.622739, 42.070206], 'pop': 15338, 'state': 'MA'}
- What is this doing?
In [19]:
# Break things down by steps if necessary.
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
"pop": { "$sum": "$pop" } }
}
] )
pretty_print(output, n_to_print=10)
{'_id': {'city': 'VALMEYER', 'state': 'IL'}, 'pop': 1841} {'_id': {'city': 'PIEDMONT', 'state': 'MO'}, 'pop': 3719} {'_id': {'city': 'HOPKINSVILLE', 'state': 'KY'}, 'pop': 39331} {'_id': {'city': 'MINIER', 'state': 'IL'}, 'pop': 1483} {'_id': {'city': 'PARAMUS', 'state': 'NJ'}, 'pop': 25085} {'_id': {'city': 'SEVERNA PARK', 'state': 'MD'}, 'pop': 23392} {'_id': {'city': 'HUNTINGTON', 'state': 'AR'}, 'pop': 2608} {'_id': {'city': 'LUTHER', 'state': 'OK'}, 'pop': 2111} {'_id': {'city': 'STATE CENTER', 'state': 'IA'}, 'pop': 1618} {'_id': {'city': 'TAFT', 'state': 'OK'}, 'pop': 1822}
In [20]:
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
"pop": { "$sum": "$pop" } }
},
{ "$group": { "_id": "$_id.state",
"avgCityPop": { "$avg": "$pop" } }
}
] )
pretty_print(output, n_to_print=10)
{'_id': 'NY', 'avgCityPop': 13131.680291970803} {'_id': 'MT', 'avgCityPop': 2593.987012987013} {'_id': 'MO', 'avgCityPop': 5672.195338512764} {'_id': 'CO', 'avgCityPop': 9981.075757575758} {'_id': 'CA', 'avgCityPop': 27756.42723880597} {'_id': 'KY', 'avgCityPop': 4767.164721141375} {'_id': 'OK', 'avgCityPop': 6155.743639921722} {'_id': 'NV', 'avgCityPop': 18209.590909090908} {'_id': 'MD', 'avgCityPop': 12615.775725593667} {'_id': 'DC', 'avgCityPop': 303450.0}
- What is this doing?
In [25]:
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
"pop": { "$sum": "$pop" } } },
{ "$sort": { "pop": -1 } },
{ "$group": { "_id" : "$_id.state",
"bigCity": { "$first": "$_id.city" },
"bigPop": { "$first": "$pop" } } },
{ "$sort" : {"bigPop" : -1} },
{ "$project" : {"bigPop" : 0} }
] )
pretty_print(output, n_to_print=10)
{'_id': 'IL', 'bigCity': 'CHICAGO'} {'_id': 'NY', 'bigCity': 'BROOKLYN'} {'_id': 'CA', 'bigCity': 'LOS ANGELES'} {'_id': 'TX', 'bigCity': 'HOUSTON'} {'_id': 'PA', 'bigCity': 'PHILADELPHIA'} {'_id': 'MI', 'bigCity': 'DETROIT'} {'_id': 'AZ', 'bigCity': 'PHOENIX'} {'_id': 'FL', 'bigCity': 'MIAMI'} {'_id': 'MD', 'bigCity': 'BALTIMORE'} {'_id': 'TN', 'bigCity': 'MEMPHIS'}
In [ ]:
- What is this doing?
In [27]:
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
"pop": { "$sum": "$pop" } } },
{ "$sort": { "pop": -1 } },
{ "$group": { "_id" : "$_id.state",
"bigCity": { "$first": "$_id.city" },
"bigPop": { "$first": "$pop" } } },
{ "$sort" : {"bigPop" : -1} },
{ "$project" : { "_id" : 0, "state" : "$_id",
"bigCityDeets": { "name": "$bigCity", "pop": "$bigPop" } } }
] )
pretty_print(output, n_to_print=10)
{'bigCityDeets': {'name': 'CHICAGO', 'pop': 2452177}, 'state': 'IL'} {'bigCityDeets': {'name': 'BROOKLYN', 'pop': 2300504}, 'state': 'NY'} {'bigCityDeets': {'name': 'LOS ANGELES', 'pop': 2102295}, 'state': 'CA'} {'bigCityDeets': {'name': 'HOUSTON', 'pop': 2095918}, 'state': 'TX'} {'bigCityDeets': {'name': 'PHILADELPHIA', 'pop': 1610956}, 'state': 'PA'} {'bigCityDeets': {'name': 'DETROIT', 'pop': 963243}, 'state': 'MI'} {'bigCityDeets': {'name': 'PHOENIX', 'pop': 890853}, 'state': 'AZ'} {'bigCityDeets': {'name': 'MIAMI', 'pop': 825232}, 'state': 'FL'} {'bigCityDeets': {'name': 'BALTIMORE', 'pop': 733081}, 'state': 'MD'} {'bigCityDeets': {'name': 'MEMPHIS', 'pop': 632837}, 'state': 'TN'}
In [ ]:
[Extra] Aggregation Pipeline Demos¶
Nobel laureate data from Kaggle: https://www.kaggle.com/datasets/imdevskp/nobel-prize/data
In [30]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()
client.drop_database('nobel_prizes') # if already exists
db = client.nobel_prizes
# run this cell to insert into the collection prizes
with open('data/prize.json', encoding='utf-8') as f:
db.prizes.insert_many(json.loads(f.read()))
print("databases", client.list_database_names())
db = client.nobel_prizes
print("collections in nobel_prizes database", db.list_collection_names())
databases ['admin', 'aggquerydb', 'config', 'local', 'nobel_prizes'] collections in nobel_prizes database ['prizes']
In [ ]:
In [ ]:
# A
output = db.prizes.aggregate([{"$group": {"_id": "$category",
"awardyears": {"$sum" : 1}}}])
pretty_print(output, n_to_print=-1)
In [ ]:
# B
output = db.prizes.aggregate([{"$group": {"_id": "$category",
"awardyears": {"$sum" : 1}}},
{"$match" : {"awardyears": {"$lt": 100}}}])
pretty_print(output, n_to_print=-1)
In [ ]:
# C
output = db.prizes.aggregate([{"$group": {"_id": "$category",
"awardyears": {"$sum" : 1}}},
{"$match" : {"awardyears": {"$lt": 100}}},
{"$project" : {"_id": 0, "awardyears": 1}}])
pretty_print(output, n_to_print=-1)
In [ ]:
# D
output = db.prizes.aggregate([{"$unwind": "$laureates"},
{"$group": {"_id": "$category",
"awards": {"$sum" : 1}}}])
pretty_print(output, n_to_print=-1)
In [ ]:
# E
output = db.prizes.aggregate([{"$unwind": "$laureates"},
{"$group": {"_id": {"category": "$category",
"year": "$year"},
"awards": {"$sum" : 1}}}])
pretty_print(output, n_to_print=10)
In [ ]:
# F
output = db.prizes.aggregate([{"$unwind": "$laureates"},
{"$group": {"_id": {"category": "$category",
"year": "$year"},
"awards": {"$sum" : 1}}},
{"$sort" : {"awards": -1}}])
pretty_print(output, n_to_print=15)
In [ ]:
# G
output = db.prizes.aggregate([{"$unwind": "$laureates"},
{"$group": {"_id": {"category": "$category",
"year": "$year"},
"awards": {"$sum" : 1}}},
{"$group": {"_id":"$_id.category",
"avgawards": {"$avg" : "$awards"}}}])
pretty_print(output, n_to_print=-1)