Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# CDCWonder_NNDSS_Infectious_Weekly

## Overview
Notifiable Infectious Diseases Data: Weekly tables from CDC WONDER which have the incident counts of different infectious diseases per week, over the previous 52 weeks, as reported by the 50 states, New York City, the District of Columbia, and the U.S. territories.

## Data Source
**Source URL:**
`https://data.cdc.gov/api/views/x9gk-5huc/rows.csv?accessType=DOWNLOAD&api_foundry=true`

## How To Download Input Data
To download and process the data, you'll need to run the provided preprocess script, `preprocess.py`. This script will automatically create an "input_files" folder where you should place the file to be processed. The script also adds a new column, 'observationDate', to the input files.

statvars: Infectious Diseases

## Download the data:
To download and preprocess the source data, run:
```python3 preprocess.py```

## Processing Instructions
To process data and generate statistical variables, use the following command from the "data" directory:

**For Test Data Run**
```
python3 tools/statvar_importer/stat_var_processor.py \
--input_data=statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/testdata/NNDSS_Weekly_Data.csv \
--pv_map=statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/nndss_weekly_pvmap.csv \
--config_file=statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/nndss_weekly_metadata.csv \
--output_path=statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/testdata/nndss_weekly_output
```

**For Main data run**
```bash
python3 tools/statvar_importer/stat_var_processor.py \
--input_data=statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/input_files/NNDSS_Weekly_Data.csv \
--pv_map=statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/nndss_weekly_pvmap.csv \
--config_file=statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/nndss_weekly_metadata.csv \
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf \
--output_path=statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/output/nndss_weekly_output
```
36 changes: 36 additions & 0 deletions statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"import_specifications": [
{
"import_name": "CDCWonder_NNDSS_Infectious_Weekly",
"curator_emails": [
"support@datacommons.org"
],
"provenance_url": "https://data.cdc.gov/api/views/x9gk-5huc/rows.csv?accessType=DOWNLOAD&api_foundry=true",
"provenance_description": "Notifiable Infectious Diseases Data: Weekly tables from CDC WONDER which has the incident counts of different infectious diseases per week that are reported by the 50 states, New York City, the District of Columbia, and the U.S. territories.",
"scripts": [
"preprocess.py",
"python ../../../tools/statvar_importer/stat_var_processor.py --input_data=input_files/NNDSS_Weekly_Data.csv --pv_map='nndss_weekly_pvmap.csv' --config_file=nndss_weekly_metadata.csv --output_path=output/nndss_weekly_output"
],
"import_inputs": [
{
"template_mcf": "output/nndss_weekly_output.tmcf",
"cleaned_csv": "output/nndss_weekly_output.csv",
"node_mcf": "output/*.mcf"
}
],
"source_files": [
"input_files/NNDSS_Weekly_Data.csv"
],
"cron_schedule": "00 11 1,15 * *",
"resource_limits": {"cpu": 4, "memory": 8, "disk": 100},
"config_override": {
"invoke_import_validation": true,
"invoke_import_tool": true,
"invoke_differ_tool": true,
"skip_input_upload": false,
"skip_gcs_upload": false,
"cleanup_gcs_volume_mount": false
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
parameter,val
mapped_rows,1
mapped_columns,5
header_rows,1
#places_resolved_csv,
input_columns,8
#input_rows,1000

Large diffs are not rendered by default.

139 changes: 139 additions & 0 deletions statvar_imports/cdc/CDCWonder_NNDSS_InfectiousWeekly/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os, sys
import pandas as pd
from absl import app, logging
from pathlib import Path
import datetime

# Make the shared util/ directory importable so download_util_script resolves.
script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(script_dir, '../../../util'))
from download_util_script import download_file
# Working directory for raw and preprocessed input; created here if absent.
INPUT_DIR = os.path.join(script_dir, "input_files")
Path(INPUT_DIR).mkdir(parents=True, exist_ok=True)
# Filename the CDC endpoint saves the download under.
INPUT_FILE = os.path.join(INPUT_DIR, "rows.csv")
# Final filename expected by the downstream stat_var_processor step.
NEW_FILE = os.path.join(INPUT_DIR, "NNDSS_Weekly_Data.csv")
# CDC WONDER NNDSS weekly CSV export.
SOURCE_URL = "https://data.cdc.gov/api/views/x9gk-5huc/rows.csv?accessType=DOWNLOAD&api_foundry=true"


def _start_date_of_year(year: int) -> datetime.date:
"""
Return start date of the year using MMWR week rules.

A year's first MMWR week is the first full calendar week (Sunday-Saturday)
that contains at least four days of the new year. This means the start
date of the first MMWR week can be in the previous calendar year.
"""
jan_one = datetime.date(year, 1, 1)

# Calculate the day difference to get to the first Sunday of the year.
# The condition jan_one.isoweekday() > 3 accounts for the rule that
# Jan 1 must be in the first week if it falls on a Mon, Tue, Wed, or Thu.
diff = 7 * (jan_one.isoweekday() > 3) - jan_one.isoweekday()

return jan_one + datetime.timedelta(days=diff)

def get_mmwr_week_start_date(year, week) -> datetime.date:
    """Return the Sunday on which the given MMWR week of *year* begins."""
    # Week 1 begins at _start_date_of_year; every later week starts
    # exactly seven days after the previous one.
    week_one_start = _start_date_of_year(year)
    return week_one_start + datetime.timedelta(days=7 * (week - 1))

def preprocess_data(filepath: str):
    """
    Add an 'observationDate' column to the CSV at *filepath*, in place.

    The date is the MMWR week start date derived from the 'Current MMWR
    Year' and 'MMWR WEEK' columns. The new column is inserted immediately
    after 'MMWR WEEK' before the file is written back to the same path.

    Args:
        filepath (str): The path to the CSV file to read and update.

    Raises:
        RuntimeError: If the file is missing, a required column is absent,
            or any other error occurs while processing.
    """
    try:
        # Read the CSV file into a pandas DataFrame.
        df = pd.read_csv(filepath)

        # Both columns are needed to compute the observation date.
        required_cols = ['Current MMWR Year', 'MMWR WEEK']
        if not all(col in df.columns for col in required_cols):
            raise KeyError(f"The file must contain the columns: {required_cols}.")

        # NOTE: .apply(axis=1) is row-wise, not vectorized; it is used here
        # because the date computation needs both year and week per row.
        df['observationDate'] = df.apply(
            lambda row: get_mmwr_week_start_date(row['Current MMWR Year'], row['MMWR WEEK']),
            axis=1
        )

        # 'observationDate' was appended as the last column; move it to sit
        # immediately after 'MMWR WEEK' for a readable column order.
        cols = list(df.columns)
        observation_date_col = cols.pop()
        cols.insert(cols.index('MMWR WEEK') + 1, observation_date_col)
        df = df[cols]

        # Save the updated DataFrame back to the same CSV file.
        df.to_csv(filepath, index=False)
        logging.info(f"Success: File '{filepath}' has been updated and saved.")

    # Use logging.error, not absl's logging.fatal: fatal aborts the process
    # immediately, which made the raise statements below unreachable.
    except FileNotFoundError as e:
        logging.error(f"Error: The file '{filepath}' was not found.")
        raise RuntimeError(f"Error: The file '{filepath}' was not found.") from e
    except KeyError as e:
        logging.error(f"Error: Missing a required column. Details: {e}")
        raise RuntimeError(f"Error: Missing a required column. Details: {e}") from e
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        raise RuntimeError(f"Import job failed An unexpected error occurred: {e}") from e

def main(argv):
    """Download the NNDSS weekly CSV, add 'observationDate', and rename it.

    Raises:
        RuntimeError: If the download or the rename step fails (a
            preprocessing failure propagates as RuntimeError from
            preprocess_data).
    """
    # Use logging.error, not absl's logging.fatal, throughout: fatal aborts
    # the process immediately, which made the raise statements unreachable.
    try:
        download_file(url=SOURCE_URL,
                      output_folder=INPUT_DIR,
                      unzip=False,
                      headers=None,
                      tries=3,
                      delay=5,
                      backoff=2)
    except Exception as e:
        logging.error(f"Failed to download NNDSS weekly data file,{e}")
        raise RuntimeError(f"Failed to download NNDSS weekly data file,{e}") from e

    # 1. Add the 'observationDate' column to the downloaded file in place.
    preprocess_data(INPUT_FILE)

    # 2. Rename the file to the name the downstream processor expects.
    try:
        if os.path.exists(INPUT_FILE):
            # If NNDSS_Weekly_Data.csv already exists from a previous run,
            # remove it first so os.rename succeeds on platforms where the
            # destination must not exist.
            if os.path.exists(NEW_FILE):
                os.remove(NEW_FILE)
            os.rename(INPUT_FILE, NEW_FILE)
            logging.info("Successfully renamed 'rows.csv' to 'NNDSS_Weekly_Data.csv'")
        else:
            logging.warning(f"Could not rename file. '{INPUT_FILE}' does not exist.")
    except Exception as e:
        logging.error(f"Failed to rename file: {e}")
        raise RuntimeError(f"Failed to rename file: {e}") from e

if __name__ == "__main__":
    # absl's app.run parses flags and then invokes main(argv).
    app.run(main)
Loading
Loading