Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions statvar_imports/tuberculosis_percentage/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# WHO Tuberculosis Percentage Dataset
## Overview
This dataset provides the percentage of people diagnosed with a new episode of pulmonary TB whose disease was bacteriologically confirmed, sourced from the World Health Organization (WHO) Global Tuberculosis Programme.

## Data Source

**Source URL:**
https://data.who.int/indicators/i/1891124/449F55C

The data is fetched from the WHO's official Global Tuberculosis Database via their public API.

## How To Download Input Data
To download the latest data, run the provided download script `download_who_tuberculosis.py`. The script fetches the indicator data from the WHO API, merges it with country ISO3 codes, and writes the result to `statvar_imports/tuberculosis_percentage/input_files/tuberculosisPercentage_input.csv`.

**Type of place:** Country.

**Statvars:** Tuberculosis - Bacteriologically Confirmed Percentage.

**Years:** 1999 to 2024.

## Processing Instructions
To process the Tuberculosis data and generate statistical variables, use the following commands from the project's root `data` directory:

**Download input file**
```bash
python3 statvar_imports/tuberculosis_percentage/download_who_tuberculosis.py
```

**For Test Data Run**
```bash
python3 tools/statvar_importer/stat_var_processor.py \
--input_data=statvar_imports/tuberculosis_percentage/test_data/tuberculosisPercentage_input.csv \
--pv_map=statvar_imports/tuberculosis_percentage/test_data/tuberculosisPercentage_pvmap.csv \
--output_path=statvar_imports/tuberculosis_percentage/test_data/tuberculosisPercentage_output \
--config_file=statvar_imports/tuberculosis_percentage/test_data/tuberculosisPercentage_metadata.csv \
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf
```

**For Main data run**
```bash
python3 tools/statvar_importer/stat_var_processor.py \
--input_data=statvar_imports/tuberculosis_percentage/input_files/tuberculosisPercentage_input.csv \
--pv_map=statvar_imports/tuberculosis_percentage/tuberculosisPercentage_pvmap.csv \
--output_path=statvar_imports/tuberculosis_percentage/tuberculosisPercentage_output \
--config_file=statvar_imports/tuberculosis_percentage/tuberculosisPercentage_metadata.csv \
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
import requests
import io
import pandas as pd

def download_tb_percentage_data() -> None:
    """Download the WHO TB percentage indicator and attach ISO3 country codes.

    Steps:
      1. Fetch the indicator rows (IND_ID, INDICATOR_NAME, YEAR, COUNTRY,
         VALUE) as CSV from the WHO xMart API.
      2. Fetch the ``country`` -> ``iso3`` mapping from the WHO TB
         notifications CSV.
      3. Left-join the two on country name, so every API row is kept even
         when no ISO3 code is found for it.
      4. Write the result to ``input_files/tuberculosisPercentage_input.csv``
         next to this script.

    Returns:
        None. If the API answers with a non-200 status, a message is printed
        and nothing is written.

    Raises:
        requests.RequestException: If either request cannot be completed
            (connection error, timeout), or if the ISO3 mapping request
            returns an HTTP error status.
    """
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # indefinitely on an unresponsive server and the import job would hang.
    request_timeout = (10, 120)

    # 1. Get the clean data from the API using the indicator ID.
    api_url = "https://xmart-api-public.who.int/DATA_/RELAY_TB_DATA"
    params = {
        "$filter": "IND_ID eq '1891124449F55C'",
        "$select": "IND_ID,INDICATOR_NAME,YEAR,COUNTRY,VALUE",
        "$format": "csv"
    }

    print("1. Fetching clean percentage data from WHO API...")
    api_response = requests.get(api_url, params=params, timeout=request_timeout)

    if api_response.status_code != 200:
        print(f"Failed to fetch API data. HTTP {api_response.status_code}")
        return

    # Parse the raw bytes so pandas decodes them with its UTF-8 default,
    # instead of relying on the charset requests guesses from the response
    # headers. Both datasets are then decoded the same way, which matters
    # because they are joined on country names.
    api_df = pd.read_csv(io.BytesIO(api_response.content))

    # 2. Get ONLY the iso3 code from the master database.
    print("2. Fetching country iso3 codes from WHO master database...")
    master_url = "https://extranet.who.int/tme/generateCSV.asp?ds=notifications"

    # Download through requests (rather than handing the URL to pandas) so
    # the same timeout applies to this fetch as well.
    master_response = requests.get(master_url, timeout=request_timeout)
    master_response.raise_for_status()

    # Only the 'country' (for matching) and 'iso3' columns are needed.
    geo_columns = ['country', 'iso3']
    master_df = pd.read_csv(
        io.BytesIO(master_response.content), usecols=geo_columns
    ).drop_duplicates()

    # 3. Merge the two datasets together based on the country name.
    print("3. Merging data and formatting...")
    # The API uses uppercase 'COUNTRY', the master uses lowercase 'country'.
    merged_df = pd.merge(
        api_df, master_df, left_on='COUNTRY', right_on='country', how='left'
    )

    # Drop the duplicate lowercase 'country' column used for joining.
    merged_df = merged_df.drop(columns=['country'])

    # A left join leaves iso3 empty wherever the country name has no exact
    # match in the master list; report those names instead of letting them
    # pass silently into the output.
    missing_iso3 = merged_df['iso3'].isna()
    unmatched = sorted(
        merged_df.loc[missing_iso3, 'COUNTRY'].dropna().astype(str).unique()
    )
    if unmatched:
        print(
            f"Warning: no iso3 code found for {len(unmatched)} country "
            f"name(s): {', '.join(unmatched)}"
        )

    # Reorder columns so the iso3 code sits right next to the country name.
    final_columns = [
        'IND_ID', 'INDICATOR_NAME', 'YEAR', 'COUNTRY', 'iso3', 'VALUE'
    ]
    merged_df = merged_df[final_columns]

    # 4. Save to CSV under input_files/ beside this script, so the location
    # does not depend on the directory the script is launched from.
    # NOTE(review): assumes this script lives in
    # statvar_imports/tuberculosis_percentage/, as the README and manifest
    # indicate; from the repo root this is the same path as before.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "input_files")
    filename = os.path.join(output_dir, "tuberculosisPercentage_input.csv")

    os.makedirs(output_dir, exist_ok=True)

    # Save without the pandas index column.
    merged_df.to_csv(filename, index=False)
    print(f"Success! Data saved locally as '{filename}'")

# Script entry point: run the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_tb_percentage_data()
27 changes: 27 additions & 0 deletions statvar_imports/tuberculosis_percentage/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"import_specifications": [
{
"import_name": "WHO_TuberculosisBacteriologicallyConfirmedPercentage",
"curator_emails": [
"support@datacommons.org"
],
"provenance_url": "https://data.who.int/indicators/i/1891124/449F55C",
"provenance_description": "Percentage of people diagnosed with a new episode of pulmonary TB whose disease was bacteriologically confirmed",
"scripts": [
"download_who_tuberculosis.py",
"../../tools/statvar_importer/stat_var_processor.py --input_data=input_files/tuberculosisPercentage_input.csv --pv_map=tuberculosisPercentage_pvmap.csv --config_file=tuberculosisPercentage_metadata.csv --output_path=tuberculosisPercentage_output"
],
"source_files": [
"input_files/*"
],
"import_inputs": [
{
"template_mcf": "tuberculosisPercentage_output.tmcf",
"cleaned_csv": "tuberculosisPercentage_output.csv",
"stat_var_mcf": "tuberculosisPercentage_output_stat_vars.mcf"
}
],
"cron_schedule": "0 0 1 1,4,7,10 *"
}
]
}
Loading