USDA Food Database¶

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
from pathlib import Path

sys.path.append("../")

from helpers.utils import download_by_url
In [2]:
url = "https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/datasets/usda_food/database.json"
DATASET_DIR = download_by_url(url, "../input/usa_food_database/database.json")
File already downloaded to: ../input/usa_food_database/database.json
In [3]:
import json

db = json.load(open(Path(DATASET_DIR) / "database.json"))
len(db)
Out[3]:
6636
In [4]:
db[0].keys()
Out[4]:
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
In [5]:
db[0]["nutrients"][0]
Out[5]:
{'value': 25.18,
 'units': 'g',
 'description': 'Protein',
 'group': 'Composition'}
In [6]:
nutrients = pd.DataFrame(db[0]["nutrients"])
nutrients.head()
Out[6]:
value units description group
0 25.18 g Protein Composition
1 29.20 g Total lipid (fat) Composition
2 3.06 g Carbohydrate, by difference Composition
3 3.28 g Ash Other
4 376.00 kcal Energy Energy
In [7]:
info_keys = ["description", "group", "id", "manufacturer"]
info = pd.DataFrame(db, columns=info_keys)
info.head()
Out[7]:
description group id manufacturer
0 Cheese, caraway Dairy and Egg Products 1008
1 Cheese, cheddar Dairy and Egg Products 1009
2 Cheese, edam Dairy and Egg Products 1018
3 Cheese, feta Dairy and Egg Products 1019
4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
In [8]:
info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   description   6636 non-null   object
 1   group         6636 non-null   object
 2   id            6636 non-null   int64 
 3   manufacturer  5195 non-null   object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB
In [9]:
pd.value_counts(info["group"])[:10]
C:\Users\purch\AppData\Local\Temp\ipykernel_15164\3634668934.py:1: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.
  pd.value_counts(info["group"])[:10]
Out[9]:
group
Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: count, dtype: int64
In [10]:
nutrients = []
for rec in db:
    fnuts = pd.DataFrame(rec["nutrients"])
    fnuts["id"] = rec["id"]
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
nutrients.head()
Out[10]:
value units description group id
0 25.18 g Protein Composition 1008
1 29.20 g Total lipid (fat) Composition 1008
2 3.06 g Carbohydrate, by difference Composition 1008
3 3.28 g Ash Other 1008
4 376.00 kcal Energy Energy 1008
In [11]:
nutrients.duplicated().sum() # number of duplicates
Out[11]:
np.int64(14179)
In [12]:
nutrients = nutrients.drop_duplicates()
In [13]:
col_mapping = {"description": "food", "group": "fgroup"}
info = info.rename(columns=col_mapping, copy=False)
info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   food          6636 non-null   object
 1   fgroup        6636 non-null   object
 2   id            6636 non-null   int64 
 3   manufacturer  5195 non-null   object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB
In [14]:
col_mapping = {"description": "nutrient", "group": "nutgroup"}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients.info()
<class 'pandas.core.frame.DataFrame'>
Index: 375176 entries, 0 to 389354
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   value     375176 non-null  float64
 1   units     375176 non-null  object 
 2   nutrient  375176 non-null  object 
 3   nutgroup  375176 non-null  object 
 4   id        375176 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 17.2+ MB
In [15]:
nutrients.head()
Out[15]:
value units nutrient nutgroup id
0 25.18 g Protein Composition 1008
1 29.20 g Total lipid (fat) Composition 1008
2 3.06 g Carbohydrate, by difference Composition 1008
3 3.28 g Ash Other 1008
4 376.00 kcal Energy Energy 1008
In [16]:
ndata = pd.merge(nutrients, info, on="id", how="inner", validate="many_to_one")
ndata.head()
Out[16]:
value units nutrient nutgroup id food fgroup manufacturer
0 25.18 g Protein Composition 1008 Cheese, caraway Dairy and Egg Products
1 29.20 g Total lipid (fat) Composition 1008 Cheese, caraway Dairy and Egg Products
2 3.06 g Carbohydrate, by difference Composition 1008 Cheese, caraway Dairy and Egg Products
3 3.28 g Ash Other 1008 Cheese, caraway Dairy and Egg Products
4 376.00 kcal Energy Energy 1008 Cheese, caraway Dairy and Egg Products
In [17]:
ndata.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375176 entries, 0 to 375175
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   value         375176 non-null  float64
 1   units         375176 non-null  object 
 2   nutrient      375176 non-null  object 
 3   nutgroup      375176 non-null  object 
 4   id            375176 non-null  int64  
 5   food          375176 non-null  object 
 6   fgroup        375176 non-null  object 
 7   manufacturer  293054 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 22.9+ MB
In [18]:
ndata.iloc[30000]
Out[18]:
value                                             0.04
units                                                g
nutrient                                       Glycine
nutgroup                                   Amino Acids
id                                                6158
food            Soup, tomato bisque, canned, condensed
fgroup                      Soups, Sauces, and Gravies
manufacturer                                          
Name: 30000, dtype: object
In [19]:
result = ndata.groupby(["nutrient", "fgroup"])["value"].quantile(0.5)
result.head(10)
Out[19]:
nutrient          fgroup                           
Adjusted Protein  Sweets                               12.900
                  Vegetables and Vegetable Products     2.180
Alanine           Baby Foods                            0.085
                  Baked Products                        0.248
                  Beef Products                         1.550
                  Beverages                             0.003
                  Breakfast Cereals                     0.311
                  Cereal Grains and Pasta               0.373
                  Dairy and Egg Products                0.271
                  Ethnic Foods                          1.290
Name: value, dtype: float64
In [20]:
result["Zinc, Zn"].sort_values().plot(kind="barh", figsize=(10, 12))
Out[20]:
<Axes: ylabel='fgroup'>
No description has been provided for this image
In [21]:
by_nutrient = ndata.groupby(["nutgroup", "nutrient"])
by_nutrient.size().head(10)
Out[21]:
nutgroup     nutrient      
Amino Acids  Alanine           4082
             Arginine          4127
             Aspartic acid     4085
             Cystine           4084
             Glutamic acid     4086
             Glycine           4083
             Histidine         4135
             Hydroxyproline     853
             Isoleucine        4142
             Leucine           4142
dtype: int64
In [22]:
def get_maximum(x):
    return x.loc[x.value.idxmax()]
max_foods = by_nutrient.apply(get_maximum)[["value", "food"]]

# make the food a little smaller
max_foods["food"] = max_foods["food"].str[:50]
max_foods.head(10)
C:\Users\purch\AppData\Local\Temp\ipykernel_15164\330857542.py:3: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  max_foods = by_nutrient.apply(get_maximum)[["value", "food"]]
Out[22]:
value food
nutgroup nutrient
Amino Acids Alanine 8.009 Gelatins, dry powder, unsweetened
Arginine 7.436 Seeds, sesame flour, low-fat
Aspartic acid 10.203 Soy protein isolate
Cystine 1.307 Seeds, cottonseed flour, low fat (glandless)
Glutamic acid 17.452 Soy protein isolate
Glycine 19.049 Gelatins, dry powder, unsweetened
Histidine 2.999 Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline 0.803 KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine 4.300 Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine 7.200 Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
In [23]:
max_foods.loc["Amino Acids"]["food"].head(10)
Out[23]:
nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object
In [ ]: