Skip to main content Link Menu Expand (external link) Document Search Copy Copied
  • Home
  • Weekly Schedule
  • Syllabus
  • Staff
  • Assignment Tips
  • Optional Final Project
  • Resources
  • Acknowledgments
  • Course FAQ
  • Course Notes
View all course offerings
  • Course Notes
  • DataHub
  • Additional Extensions

Lecture 18: Mongo DB II¶

In [1]:
import json
import pymongo
import pprint
In [2]:
# instead of saving a full copy of existing data,
# just symlink to previous lecture. a bit hacky, but saves space!
!ln -sf ../lec17-mongo_i/data data
In [2]:
# this is a utility function we define so that find_all() prints nicely
def pretty_print(output_collection, n_to_print=3):
    """
    note if n_to_print is -1, this print everything
    """
    for i, doc in enumerate(output_collection):
        pprint.pprint(doc)        # nicely formats each document
        if i+1 == n_to_print: return
In [ ]:

Aggregation Queries¶

Zips JSON from the MongoDB Aggregation Tutorial: https://www.mongodb.com/docs/manual/tutorial/aggregation-zip-code-data-set/

Load zips.json into new local.zips¶

For the sake of simplicity, we'll make a new collection zips in a new aggquerydb database.

In [3]:
# reimport/redefine as needed
import json
import pymongo
import pprint

def pretty_print(output_collection, n_to_print=3):
    """
    note if n_to_print is -1, this print everything
    """
    for i, doc in enumerate(output_collection):
        pprint.pprint(doc)        # nicely formats each document
        if i+1 == n_to_print: return
In [5]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()
Out[5]:
['admin', 'config', 'local', 'nobel_prizes']
In [6]:
# run this cell to make the new collection
# and insert zipcode documents

client.drop_database('aggquerydb') # if already exists
db = client.aggquerydb

with open('data/zips.json', encoding='utf-8') as f:
    for line in f.readlines():
        db.zips.insert_one(json.loads(line))
In [7]:
db.zips.count_documents({})
Out[7]:
29353

$group¶

In [7]:
# Just the grouping, no filtering
output = db.zips.aggregate( [
    { "$group": { "_id": "$state",
              "totalPop":
                   { "$sum": "$pop" } } },
] )

pretty_print(output, n_to_print=5)
{'_id': 'ND', 'totalPop': 638272}
{'_id': 'AR', 'totalPop': 2350725}
{'_id': 'HI', 'totalPop': 1108229}
{'_id': 'WV', 'totalPop': 1793146}
{'_id': 'TX', 'totalPop': 16984601}
In [9]:
output = db.zips.aggregate( [
{ "$group": { "_id": "$state",
              "totalPop":
                   { "$sum": "$pop" } } },
{ "$match": { "totalPop":
            { "$gte": 15000000 } } },
{ "$sort" : { "totalPop" : -1 } }
] )

pretty_print(output, n_to_print=10)
{'_id': 'CA', 'totalPop': 29754890}
{'_id': 'NY', 'totalPop': 17990402}
{'_id': 'TX', 'totalPop': 16984601}

Aggregation Queries: Unwind/Lookup¶

Make a new collection, inventory.

In [10]:
db = client.aggquerydb          # stay in same database
db.drop_collection('inventory') # recreate as needed

db.inventory.insert_many( [
{ "item": "journal",
 "tags": ["blank", "red"],
 "dim": [ 14, 21 ],
 "instock": [ { "loc": "A", "qty": 5 }, { "loc": "C", "qty": 15 } ]
},
{ "item": "notebook",
 "tags": ["red", "blank"],
 "dim": [ 14, 21 ],
 "instock": [ { "loc": "C", "qty": 5 } ]
},
{ "item": "paper",
 "tags": ["red", "blank", "plain"],
 "dim": [ 14, 21 ],
 "instock": [ { "loc": "A", "qty": 60 }, { "loc": "B", "qty": 15 } ]
},
{ "item": "planner",
 "tags": ["blank", "red"],
 "dim": [ 22.85, 30 ] ,
 "instock": [ { "loc": "A", "qty": 40 }, { "loc": "B", "qty": 5 } ]
},
{ "item": "postcard",
 "tags": ["blue"],
 "dim": [ 10, 15.25 ],
 "instock": [ { "loc": "E", "qty": 15 }, { "loc": "D", "qty": 35 } ]
}
]);
In [ ]:

In [11]:
pretty_print(db.inventory.find({}))
{'_id': ObjectId('6723bcb2c96a1d9cd406ab20'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'tags': ['blank', 'red']}
{'_id': ObjectId('6723bcb2c96a1d9cd406ab21'),
 'dim': [14, 21],
 'instock': [{'loc': 'C', 'qty': 5}],
 'item': 'notebook',
 'tags': ['red', 'blank']}
{'_id': ObjectId('6723bcb2c96a1d9cd406ab22'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 60}, {'loc': 'B', 'qty': 15}],
 'item': 'paper',
 'tags': ['red', 'blank', 'plain']}

Unwind example¶

In [12]:
# Notice, tags is no longer an array
output = db.inventory.aggregate( [
{ "$unwind" : "$tags" }
] )

pretty_print(output, n_to_print=4)
{'_id': ObjectId('6723bcb2c96a1d9cd406ab20'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'tags': 'blank'}
{'_id': ObjectId('6723bcb2c96a1d9cd406ab20'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'tags': 'red'}
{'_id': ObjectId('6723bcb2c96a1d9cd406ab21'),
 'dim': [14, 21],
 'instock': [{'loc': 'C', 'qty': 5}],
 'item': 'notebook',
 'tags': 'red'}
{'_id': ObjectId('6723bcb2c96a1d9cd406ab21'),
 'dim': [14, 21],
 'instock': [{'loc': 'C', 'qty': 5}],
 'item': 'notebook',
 'tags': 'blank'}
In [13]:
output = db.inventory.aggregate( [
{ "$unwind" : "$tags" },
{ "$project" : {"_id" : 0, "instock": 0}}
] )

pretty_print(output, n_to_print=-1)
{'dim': [14, 21], 'item': 'journal', 'tags': 'blank'}
{'dim': [14, 21], 'item': 'journal', 'tags': 'red'}
{'dim': [14, 21], 'item': 'notebook', 'tags': 'red'}
{'dim': [14, 21], 'item': 'notebook', 'tags': 'blank'}
{'dim': [14, 21], 'item': 'paper', 'tags': 'red'}
{'dim': [14, 21], 'item': 'paper', 'tags': 'blank'}
{'dim': [14, 21], 'item': 'paper', 'tags': 'plain'}
{'dim': [22.85, 30], 'item': 'planner', 'tags': 'blank'}
{'dim': [22.85, 30], 'item': 'planner', 'tags': 'red'}
{'dim': [10, 15.25], 'item': 'postcard', 'tags': 'blue'}
In [14]:
output = db.inventory.aggregate( [
    { "$unwind" : "$instock" },
    { "$group" : { "_id" : "$item", "totalqty" : {"$sum" : "$instock.qty"}}}
] )

pretty_print(output, n_to_print=-1)
{'_id': 'planner', 'totalqty': 45}
{'_id': 'journal', 'totalqty': 20}
{'_id': 'postcard', 'totalqty': 50}
{'_id': 'paper', 'totalqty': 75}
{'_id': 'notebook', 'totalqty': 5}
In [16]:
output = db.inventory.aggregate( [
{ "$lookup" : {
    "from" : "inventory",
    "localField": "instock.loc",
    "foreignField": "instock.loc",
    "as":"otheritems"}
},
{ "$project" :
    {"_id" : 0, "tags" : 0, "dim" : 0, "otheritems._id": 0}
}
] )

pretty_print(output, n_to_print=2)
{'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'otheritems': [{'dim': [14, 21],
                 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
                 'item': 'journal',
                 'tags': ['blank', 'red']},
                {'dim': [14, 21],
                 'instock': [{'loc': 'C', 'qty': 5}],
                 'item': 'notebook',
                 'tags': ['red', 'blank']},
                {'dim': [14, 21],
                 'instock': [{'loc': 'A', 'qty': 60}, {'loc': 'B', 'qty': 15}],
                 'item': 'paper',
                 'tags': ['red', 'blank', 'plain']},
                {'dim': [22.85, 30],
                 'instock': [{'loc': 'A', 'qty': 40}, {'loc': 'B', 'qty': 5}],
                 'item': 'planner',
                 'tags': ['blank', 'red']}]}
{'instock': [{'loc': 'C', 'qty': 5}],
 'item': 'notebook',
 'otheritems': [{'dim': [14, 21],
                 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
                 'item': 'journal',
                 'tags': ['blank', 'red']},
                {'dim': [14, 21],
                 'instock': [{'loc': 'C', 'qty': 5}],
                 'item': 'notebook',
                 'tags': ['red', 'blank']}]}

Multiple Attribute Grouping¶

In [17]:
# reimport/redefine as needed
import json
import pymongo
import pprint

def pretty_print(output_collection, n_to_print=3):
    """
    note if n_to_print is -1, this print everything
    """
    for i, doc in enumerate(output_collection):
        pprint.pprint(doc)        # nicely formats each document
        if i+1 == n_to_print: return
In [19]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()
Out[19]:
['admin', 'aggquerydb', 'config', 'local', 'nobel_prizes']
In [20]:
db = client.aggquerydb
In [24]:
db.zips.count_documents({})
Out[24]:
29353
In [18]:
db.zips.find_one()
Out[18]:
{'_id': '01001',
 'city': 'AGAWAM',
 'loc': [-72.622739, 42.070206],
 'pop': 15338,
 'state': 'MA'}
  1. What is this doing?
In [19]:
# Break things down by steps if necessary.
output = db.zips.aggregate( [
    { "$group": { "_id": { "state": "$state", "city": "$city" },
                 "pop": { "$sum": "$pop" } }
    }
] )
pretty_print(output, n_to_print=10)
{'_id': {'city': 'VALMEYER', 'state': 'IL'}, 'pop': 1841}
{'_id': {'city': 'PIEDMONT', 'state': 'MO'}, 'pop': 3719}
{'_id': {'city': 'HOPKINSVILLE', 'state': 'KY'}, 'pop': 39331}
{'_id': {'city': 'MINIER', 'state': 'IL'}, 'pop': 1483}
{'_id': {'city': 'PARAMUS', 'state': 'NJ'}, 'pop': 25085}
{'_id': {'city': 'SEVERNA PARK', 'state': 'MD'}, 'pop': 23392}
{'_id': {'city': 'HUNTINGTON', 'state': 'AR'}, 'pop': 2608}
{'_id': {'city': 'LUTHER', 'state': 'OK'}, 'pop': 2111}
{'_id': {'city': 'STATE CENTER', 'state': 'IA'}, 'pop': 1618}
{'_id': {'city': 'TAFT', 'state': 'OK'}, 'pop': 1822}
In [20]:
output = db.zips.aggregate( [
    { "$group": { "_id": { "state": "$state", "city": "$city" },
                 "pop": { "$sum": "$pop" } }
    },
    { "$group": { "_id": "$_id.state",
                 "avgCityPop": { "$avg": "$pop" } }
    }
] )
pretty_print(output, n_to_print=10)
{'_id': 'NY', 'avgCityPop': 13131.680291970803}
{'_id': 'MT', 'avgCityPop': 2593.987012987013}
{'_id': 'MO', 'avgCityPop': 5672.195338512764}
{'_id': 'CO', 'avgCityPop': 9981.075757575758}
{'_id': 'CA', 'avgCityPop': 27756.42723880597}
{'_id': 'KY', 'avgCityPop': 4767.164721141375}
{'_id': 'OK', 'avgCityPop': 6155.743639921722}
{'_id': 'NV', 'avgCityPop': 18209.590909090908}
{'_id': 'MD', 'avgCityPop': 12615.775725593667}
{'_id': 'DC', 'avgCityPop': 303450.0}
  1. What is this doing?
In [25]:
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
             "pop": { "$sum": "$pop" } } },
{ "$sort": { "pop": -1 } },
{ "$group": { "_id" : "$_id.state",
             "bigCity": { "$first": "$_id.city" },
             "bigPop": { "$first": "$pop" } } },
{ "$sort" : {"bigPop" : -1} },
{ "$project" : {"bigPop" : 0} }
] )

pretty_print(output, n_to_print=10)
{'_id': 'IL', 'bigCity': 'CHICAGO'}
{'_id': 'NY', 'bigCity': 'BROOKLYN'}
{'_id': 'CA', 'bigCity': 'LOS ANGELES'}
{'_id': 'TX', 'bigCity': 'HOUSTON'}
{'_id': 'PA', 'bigCity': 'PHILADELPHIA'}
{'_id': 'MI', 'bigCity': 'DETROIT'}
{'_id': 'AZ', 'bigCity': 'PHOENIX'}
{'_id': 'FL', 'bigCity': 'MIAMI'}
{'_id': 'MD', 'bigCity': 'BALTIMORE'}
{'_id': 'TN', 'bigCity': 'MEMPHIS'}
In [ ]:

  1. What is this doing?
In [27]:
output = db.zips.aggregate( [
{ "$group": { "_id": { "state": "$state", "city": "$city" },
             "pop": { "$sum": "$pop" } } },
{ "$sort": { "pop": -1 } },
{ "$group": { "_id" : "$_id.state",
             "bigCity": { "$first": "$_id.city" },
             "bigPop": { "$first": "$pop" } } },
{ "$sort" : {"bigPop" : -1} },
{ "$project" : { "_id" : 0, "state" : "$_id",
                "bigCityDeets": { "name": "$bigCity", "pop": "$bigPop" } } }
] )

pretty_print(output, n_to_print=10)
{'bigCityDeets': {'name': 'CHICAGO', 'pop': 2452177}, 'state': 'IL'}
{'bigCityDeets': {'name': 'BROOKLYN', 'pop': 2300504}, 'state': 'NY'}
{'bigCityDeets': {'name': 'LOS ANGELES', 'pop': 2102295}, 'state': 'CA'}
{'bigCityDeets': {'name': 'HOUSTON', 'pop': 2095918}, 'state': 'TX'}
{'bigCityDeets': {'name': 'PHILADELPHIA', 'pop': 1610956}, 'state': 'PA'}
{'bigCityDeets': {'name': 'DETROIT', 'pop': 963243}, 'state': 'MI'}
{'bigCityDeets': {'name': 'PHOENIX', 'pop': 890853}, 'state': 'AZ'}
{'bigCityDeets': {'name': 'MIAMI', 'pop': 825232}, 'state': 'FL'}
{'bigCityDeets': {'name': 'BALTIMORE', 'pop': 733081}, 'state': 'MD'}
{'bigCityDeets': {'name': 'MEMPHIS', 'pop': 632837}, 'state': 'TN'}
In [ ]:

[Extra] Aggregation Pipeline Demos¶

Nobel laureate data from Kaggle: https://www.kaggle.com/datasets/imdevskp/nobel-prize/data

In [30]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()

client.drop_database('nobel_prizes') # if already exists

db = client.nobel_prizes
# run this cell to insert into the collection prizes
with open('data/prize.json', encoding='utf-8') as f:
    db.prizes.insert_many(json.loads(f.read()))

print("databases", client.list_database_names())

db = client.nobel_prizes
print("collections in nobel_prizes database", db.list_collection_names())
databases ['admin', 'aggquerydb', 'config', 'local', 'nobel_prizes']
collections in nobel_prizes database ['prizes']
In [ ]:

In [ ]:
# A
output = db.prizes.aggregate([{"$group": {"_id": "$category",
                                          "awardyears": {"$sum" : 1}}}])
pretty_print(output, n_to_print=-1)
In [ ]:
# B
output = db.prizes.aggregate([{"$group": {"_id": "$category",
                                          "awardyears": {"$sum" : 1}}},
                              {"$match" : {"awardyears": {"$lt": 100}}}])
pretty_print(output, n_to_print=-1)
In [ ]:
# C
output = db.prizes.aggregate([{"$group": {"_id": "$category",
                                 "awardyears": {"$sum" : 1}}},
                     {"$match" : {"awardyears": {"$lt": 100}}},
                     {"$project" : {"_id": 0, "awardyears": 1}}])
pretty_print(output, n_to_print=-1)
In [ ]:
# D
output = db.prizes.aggregate([{"$unwind": "$laureates"},
                              {"$group": {"_id": "$category",
                                          "awards": {"$sum" : 1}}}])
pretty_print(output, n_to_print=-1)
In [ ]:
# E
output = db.prizes.aggregate([{"$unwind": "$laureates"},
                              {"$group": {"_id": {"category": "$category",
                                                  "year": "$year"},
                                          "awards": {"$sum" : 1}}}])
pretty_print(output, n_to_print=10)
In [ ]:
# F
output = db.prizes.aggregate([{"$unwind": "$laureates"},
                              {"$group": {"_id": {"category": "$category",
                                                  "year": "$year"},
                                          "awards": {"$sum" : 1}}},
                              {"$sort" : {"awards": -1}}])
pretty_print(output, n_to_print=15)
In [ ]:
# G
output = db.prizes.aggregate([{"$unwind": "$laureates"},
                              {"$group": {"_id": {"category": "$category",
                                                  "year": "$year"},
                                          "awards": {"$sum" : 1}}},
                              {"$group": {"_id":"$_id.category",
                                          "avgawards": {"$avg" : "$awards"}}}])
pretty_print(output, n_to_print=-1)

Back to Top

Accessibility Nondiscrimination

Copyright ©2025, Regents of the University of Californa and respective authors.

This site is built following the Berkeley Class Site template, which is generously based on the Just the Class, and Just the Docs templates.