import json
import pymongo
import pprint
# instead of saving a full copy of existing data,
# just symlink to previous lecture. a bit hacky, but saves space!
!ln -sf ../lec18/data data
# this is a utility function we define so that find_all() prints nicely
def pretty_print(output_collection, n_to_print=3):
"""
note if n_to_print is -1, this print everything
"""
for i, doc in enumerate(output_collection):
pprint.pprint(doc) # nicely formats each document
if i+1 == n_to_print: return
Zips JSON from the MongoDB Aggregation Tutorial: https://www.mongodb.com/docs/manual/tutorial/aggregation-zip-code-data-set/
zips.json
into new local.zips
¶For the sake of simplicity, we'll make a new collection zips
in a new aggquerydb
database.
# reimport/redefine as needed
import json
import pymongo
import pprint
def pretty_print(output_collection, n_to_print=3):
"""
note if n_to_print is -1, this print everything
"""
for i, doc in enumerate(output_collection):
pprint.pprint(doc) # nicely formats each document
if i+1 == n_to_print: return
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()
['admin', 'aggquerydb', 'config', 'local', 'nobel_prizes']
# run this cell to make the new collection
# and insert zipcode documents
client.drop_database('aggquerydb') # if already exists
db = client.aggquerydb
with open('data/zips.json', encoding='utf-8') as f:
for line in f.readlines():
db.zips.insert_one(json.loads(line))
db.zips.count_documents({})
29353
$group
¶output = db.zips.aggregate( [
{ "$group": { "_id": "$state",
"totalPop":
{ "$sum": "$pop" } } },
{ "$match": { "totalPop":
{ "$gte": 15000000 } } },
{ "$sort" : { "totalPop" : -1 } }
] )
pretty_print(output)
{'_id': 'CA', 'totalPop': 29754890} {'_id': 'NY', 'totalPop': 17990402} {'_id': 'TX', 'totalPop': 16984601}
Make a new collection, inventory
.
db = client.aggquerydb # stay in same database
db.drop_collection('inventory') # recreate as needed
db.inventory.insert_many( [
{ "item": "journal",
"tags": ["blank", "red"],
"dim": [ 14, 21 ],
"instock": [ { "loc": "A", "qty": 5 }, { "loc": "C", "qty": 15 } ]
},
{ "item": "notebook",
"tags": ["red", "blank"],
"dim": [ 14, 21 ],
"instock": [ { "loc": "C", "qty": 5 } ]
},
{ "item": "paper",
"tags": ["red", "blank", "plain"],
"dim": [ 14, 21 ],
"instock": [ { "loc": "A", "qty": 60 }, { "loc": "B", "qty": 15 } ]
},
{ "item": "planner",
"tags": ["blank", "red"],
"dim": [ 22.85, 30 ] ,
"instock": [ { "loc": "A", "qty": 40 }, { "loc": "B", "qty": 5 } ]
},
{ "item": "postcard",
"tags": ["blue"],
"dim": [ 10, 15.25 ],
"instock": [ { "loc": "E", "qty": 15 }, { "loc": "D", "qty": 35 } ]
}
]);
output = db.inventory.aggregate( [
{ "$unwind" : "$tags" },
{ "$project" : {"_id" : 0, "instock": 0}}
] )
pretty_print(output, n_to_print=-1)
{'dim': [14, 21], 'item': 'journal', 'tags': 'blank'} {'dim': [14, 21], 'item': 'journal', 'tags': 'red'} {'dim': [14, 21], 'item': 'notebook', 'tags': 'red'} {'dim': [14, 21], 'item': 'notebook', 'tags': 'blank'} {'dim': [14, 21], 'item': 'paper', 'tags': 'red'} {'dim': [14, 21], 'item': 'paper', 'tags': 'blank'} {'dim': [14, 21], 'item': 'paper', 'tags': 'plain'} {'dim': [22.85, 30], 'item': 'planner', 'tags': 'blank'} {'dim': [22.85, 30], 'item': 'planner', 'tags': 'red'} {'dim': [10, 15.25], 'item': 'postcard', 'tags': 'blue'}
output = db.inventory.aggregate( [
{ "$unwind" : "$instock" },
{ "$group" : { "_id" : "$item", "totalqty" : {"$sum" : "$instock.qty"}}}
] )
pretty_print(output, n_to_print=-1)
{'_id': 'paper', 'totalqty': 75} {'_id': 'notebook', 'totalqty': 5} {'_id': 'planner', 'totalqty': 45} {'_id': 'journal', 'totalqty': 20} {'_id': 'postcard', 'totalqty': 50}
output = db.inventory.aggregate( [
{ "$lookup" : {
"from" : "inventory",
"localField": "instock.loc",
"foreignField": "instock.loc",
"as":"otheritems"}
},
{ "$project" :
{"_id" : 0, "tags" : 0, "dim" : 0, "otheritems._id": 0}
} ] )
pretty_print(output, n_to_print=1)
{'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}], 'item': 'journal', 'otheritems': [{'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}], 'item': 'journal', 'tags': ['blank', 'red']}, {'dim': [14, 21], 'instock': [{'loc': 'C', 'qty': 5}], 'item': 'notebook', 'tags': ['red', 'blank']}, {'dim': [14, 21], 'instock': [{'loc': 'A', 'qty': 60}, {'loc': 'B', 'qty': 15}], 'item': 'paper', 'tags': ['red', 'blank', 'plain']}, {'dim': [22.85, 30], 'instock': [{'loc': 'A', 'qty': 40}, {'loc': 'B', 'qty': 5}], 'item': 'planner', 'tags': ['blank', 'red']}]}
db = client.aggquerydb
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
"pop": { "$sum": "$pop" } }
},
{ "$group": { "_id": "$_id.state",
"avgCityPop": { "$avg": "$pop" } }
}
] )
pretty_print(output, n_to_print=10)
{'_id': 'IL', 'avgCityPop': 9954.334494773519} {'_id': 'PA', 'avgCityPop': 8679.067202337472} {'_id': 'NH', 'avgCityPop': 5232.320754716981} {'_id': 'IA', 'avgCityPop': 3123.0821147356583} {'_id': 'FL', 'avgCityPop': 27400.958963282937} {'_id': 'AR', 'avgCityPop': 4175.355239786856} {'_id': 'LA', 'avgCityPop': 10465.496277915632} {'_id': 'RI', 'avgCityPop': 19292.653846153848} {'_id': 'OR', 'avgCityPop': 8262.561046511628} {'_id': 'NC', 'avgCityPop': 10622.815705128205}
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
"pop": { "$sum": "$pop" } } },
{ "$sort": { "pop": -1 } },
{ "$group": { "_id" : "$_id.state",
"bigCity": { "$first": "$_id.city" },
"bigPop": { "$first": "$pop" } } },
{ "$sort" : {"bigPop" : -1} },
{ "$project" : {"bigPop" : 0} }
] )
pretty_print(output, n_to_print=10)
{'_id': 'IL', 'bigCity': 'CHICAGO'} {'_id': 'NY', 'bigCity': 'BROOKLYN'} {'_id': 'CA', 'bigCity': 'LOS ANGELES'} {'_id': 'TX', 'bigCity': 'HOUSTON'} {'_id': 'PA', 'bigCity': 'PHILADELPHIA'} {'_id': 'MI', 'bigCity': 'DETROIT'} {'_id': 'AZ', 'bigCity': 'PHOENIX'} {'_id': 'FL', 'bigCity': 'MIAMI'} {'_id': 'MD', 'bigCity': 'BALTIMORE'} {'_id': 'TN', 'bigCity': 'MEMPHIS'}
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
"pop": { "$sum": "$pop" } } },
{ "$sort": { "pop": -1 } },
{ "$group": { "_id" : "$_id.state",
"bigCity": { "$first": "$_id.city" },
"bigPop": { "$first": "$pop" } } },
{ "$sort" : {"bigPop" : -1} },
{ "$project" : { "_id" : 0, "state" : "$_id",
"bigCityDeets": { "name": "$bigCity", "pop": "$bigPop" } } }
] )
pretty_print(output, n_to_print=10)
{'bigCityDeets': {'name': 'CHICAGO', 'pop': 2452177}, 'state': 'IL'} {'bigCityDeets': {'name': 'BROOKLYN', 'pop': 2300504}, 'state': 'NY'} {'bigCityDeets': {'name': 'LOS ANGELES', 'pop': 2102295}, 'state': 'CA'} {'bigCityDeets': {'name': 'HOUSTON', 'pop': 2095918}, 'state': 'TX'} {'bigCityDeets': {'name': 'PHILADELPHIA', 'pop': 1610956}, 'state': 'PA'} {'bigCityDeets': {'name': 'DETROIT', 'pop': 963243}, 'state': 'MI'} {'bigCityDeets': {'name': 'PHOENIX', 'pop': 890853}, 'state': 'AZ'} {'bigCityDeets': {'name': 'MIAMI', 'pop': 825232}, 'state': 'FL'} {'bigCityDeets': {'name': 'BALTIMORE', 'pop': 733081}, 'state': 'MD'} {'bigCityDeets': {'name': 'MEMPHIS', 'pop': 632837}, 'state': 'TN'}
Nobel laureate data from Kaggle: https://www.kaggle.com/datasets/imdevskp/nobel-prize/data
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()
client.drop_database('nobel_prizes') # if already exists
db = client.nobel_prizes
# run this cell to insert into the collection prizes
with open('data/prize.json', encoding='utf-8') as f:
db.prizes.insert_many(json.loads(f.read()))
print("databases", client.list_database_names())
db = client.nobel_prizes
print("collections in nobel_prizes database", db.list_collection_names())
databases ['admin', 'aggquerydb', 'config', 'local', 'nobel_prizes'] collections in nobel_prizes database ['prizes']
# A
output = db.prizes.aggregate([{"$group": {"_id": "$category",
"awardyears": {"$sum" : 1}}}])
pretty_print(output, n_to_print=-1)
{'_id': 'chemistry', 'awardyears': 120} {'_id': 'peace', 'awardyears': 120} {'_id': 'physics', 'awardyears': 120} {'_id': 'economics', 'awardyears': 52} {'_id': 'literature', 'awardyears': 120} {'_id': 'medicine', 'awardyears': 120}
# B
output = db.prizes.aggregate([{"$group": {"_id": "$category",
"awardyears": {"$sum" : 1}}},
{"$match" : {"awardyears": {"$lt": 100}}}])
pretty_print(output, n_to_print=-1)
{'_id': 'economics', 'awardyears': 52}
# C
output = db.prizes.aggregate([{"$group": {"_id": "$category",
"awardyears": {"$sum" : 1}}},
{"$match" : {"awardyears": {"$lt": 100}}},
{"$project" : {"_id": 0, "awardyears": 1}}])
pretty_print(output, n_to_print=-1)
{'awardyears': 52}
# D
output = db.prizes.aggregate([{"$unwind": "$laureates"},
{"$group": {"_id": "$category",
"awards": {"$sum" : 1}}}])
pretty_print(output, n_to_print=-1)
{'_id': 'chemistry', 'awards': 186} {'_id': 'peace', 'awards': 135} {'_id': 'physics', 'awards': 216} {'_id': 'economics', 'awards': 86} {'_id': 'literature', 'awards': 117} {'_id': 'medicine', 'awards': 222}
# E
output = db.prizes.aggregate([{"$unwind": "$laureates"},
{"$group": {"_id": {"category": "$category",
"year": "$year"},
"awards": {"$sum" : 1}}}])
pretty_print(output, n_to_print=10)
{'_id': {'category': 'medicine', 'year': 1913}, 'awards': 1} {'_id': {'category': 'peace', 'year': 1954}, 'awards': 1} {'_id': {'category': 'physics', 'year': 2014}, 'awards': 3} {'_id': {'category': 'literature', 'year': 1990}, 'awards': 1} {'_id': {'category': 'chemistry', 'year': 1981}, 'awards': 2} {'_id': {'category': 'chemistry', 'year': 2004}, 'awards': 3} {'_id': {'category': 'chemistry', 'year': 1937}, 'awards': 2} {'_id': {'category': 'peace', 'year': 1992}, 'awards': 1} {'_id': {'category': 'chemistry', 'year': 2020}, 'awards': 2} {'_id': {'category': 'peace', 'year': 1901}, 'awards': 2}
# F
output = db.prizes.aggregate([{"$unwind": "$laureates"},
{"$group": {"_id": {"category": "$category",
"year": "$year"},
"awards": {"$sum" : 1}}},
{"$sort" : {"awards": -1}}])
pretty_print(output, n_to_print=15)
{'_id': {'category': 'physics', 'year': 2014}, 'awards': 3} {'_id': {'category': 'chemistry', 'year': 2004}, 'awards': 3} {'_id': {'category': 'medicine', 'year': 2009}, 'awards': 3} {'_id': {'category': 'physics', 'year': 2016}, 'awards': 3} {'_id': {'category': 'medicine', 'year': 1988}, 'awards': 3} {'_id': {'category': 'physics', 'year': 1981}, 'awards': 3} {'_id': {'category': 'medicine', 'year': 1956}, 'awards': 3} {'_id': {'category': 'peace', 'year': 2011}, 'awards': 3} {'_id': {'category': 'medicine', 'year': 1962}, 'awards': 3} {'_id': {'category': 'medicine', 'year': 1981}, 'awards': 3} {'_id': {'category': 'medicine', 'year': 2011}, 'awards': 3} {'_id': {'category': 'chemistry', 'year': 1996}, 'awards': 3} {'_id': {'category': 'physics', 'year': 1903}, 'awards': 3} {'_id': {'category': 'economics', 'year': 2013}, 'awards': 3} {'_id': {'category': 'physics', 'year': 1972}, 'awards': 3}
# G
output = db.prizes.aggregate([{"$unwind": "$laureates"},
{"$group": {"_id": {"category": "$category",
"year": "$year"},
"awards": {"$sum" : 1}}},
{"$group": {"_id":"$_id.category",
"avgawards": {"$avg" : "$awards"}}}])
pretty_print(output, n_to_print=-1)
{'_id': 'chemistry', 'avgawards': 1.6607142857142858} {'_id': 'economics', 'avgawards': 1.6538461538461537} {'_id': 'medicine', 'avgawards': 2.0} {'_id': 'physics', 'avgawards': 1.894736842105263} {'_id': 'literature', 'avgawards': 1.0353982300884956} {'_id': 'peace', 'avgawards': 1.3366336633663367}