Skip to content

Commit

Permalink
feat: sanitize column names to follow defined standards (#313)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sujanadh authored Nov 15, 2024
1 parent 9287ad9 commit 46555e7
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 18 deletions.
64 changes: 46 additions & 18 deletions osm_fieldwork/update_xlsform.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,36 @@
SURVEY_GROUP_NAME = "survey_questions"


def standardize_xlsform_sheets(xlsform: dict) -> dict:
"""Standardizes column headers in both the 'survey' and 'choices' sheets of an XLSForm.
- Strips spaces and lowercases all column headers.
- Fixes formatting for columns with '::' (e.g., multilingual labels).
Args:
xlsform (dict): A dictionary with keys 'survey' and 'choices', each containing a DataFrame.
Returns:
dict: The updated XLSForm dictionary with standardized column headers.
"""

def clean_column_name(col_name):
if col_name == "label":
return "label::english(en)"
if "::" in col_name:
# Handle '::' columns (e.g., 'label::english (en)')
parts = col_name.split("::")
language_part = parts[1].replace(" ", "").lower() # Remove spaces and lowercase
return f"{parts[0]}::{language_part}"
return col_name.strip().lower() # General cleanup

# Apply cleaning to each sheet
for _sheet_name, sheet_df in xlsform.items():
sheet_df.columns = [clean_column_name(col) for col in sheet_df.columns]

return xlsform


def merge_dataframes(mandatory_df: pd.DataFrame, user_question_df: pd.DataFrame, digitisation_df: pd.DataFrame):
"""Merge multiple Pandas dataframes together, removing duplicate fields."""
# Remove empty rows from dataframes
Expand Down Expand Up @@ -92,8 +122,8 @@ def handle_translations(

if field in user_question_df.columns and not translation_columns:
# If user_question_df has only the base field (e.g., 'label'), map English translation from mandatory and digitisation
mandatory_df[field] = mandatory_df.get(f"{field}::English(en)", mandatory_df.get(field))
digitisation_df[field] = digitisation_df.get(f"{field}::English(en)", digitisation_df.get(field))
mandatory_df[field] = mandatory_df.get(f"{field}::english(en)", mandatory_df.get(field))
digitisation_df[field] = digitisation_df.get(f"{field}::english(en)", digitisation_df.get(field))

# Then drop translation columns
mandatory_df = mandatory_df.loc[:, ~mandatory_df.columns.str.startswith("label::")]
Expand Down Expand Up @@ -133,10 +163,10 @@ def create_survey_group(name: str) -> dict[str, pd.DataFrame]:
{
"type": ["begin group"],
"name": [name],
"label::English(en)": [name],
"label::Swahili(sw)": [name],
"label::French(fr)": [name],
"label::Spanish(es)": [name],
"label::english(en)": [name],
"label::swahili(sw)": [name],
"label::french(fr)": [name],
"label::spanish(es)": [name],
"relevant": "(${new_feature} != '') or (${building_exists} = 'yes')",
}
)
Expand All @@ -158,20 +188,16 @@ def append_select_one_from_file_row(df: pd.DataFrame, entity_name: str) -> pd.Da

# Find the row index after 'feature' row
row_index_to_split_on = select_one_from_file_index[0] + 1
# Strip the 's' from the end for singular form
if entity_name.endswith("s"):
# Plural to singular
entity_name = entity_name[:-1]

additional_row = pd.DataFrame(
{
"type": [f"select_one_from_file {entity_name}.csv"],
"name": [entity_name],
"label::English(en)": [entity_name],
"label::english(en)": [entity_name],
"appearance": ["map"],
"label::Swahili(sw)": [entity_name],
"label::French(fr)": [entity_name],
"label::Spanish(es)": [entity_name],
"label::swahili(sw)": [entity_name],
"label::french(fr)": [entity_name],
"label::spanish(es)": [entity_name],
}
)

Expand Down Expand Up @@ -205,12 +231,14 @@ async def append_mandatory_fields(
custom_sheets = pd.read_excel(custom_form, sheet_name=None, engine="calamine")
mandatory_sheets = pd.read_excel(f"{xlsforms_path}/common/mandatory_fields.xls", sheet_name=None, engine="calamine")
digitisation_sheets = pd.read_excel(f"{xlsforms_path}/common/digitisation_fields.xls", sheet_name=None, engine="calamine")
custom_sheets = standardize_xlsform_sheets(custom_sheets)

# Merge 'survey' and 'choices' sheets
if "survey" not in custom_sheets:
msg = "Survey sheet is required in XLSForm!"
log.error(msg)
raise ValueError(msg)

log.debug("Merging survey sheet XLSForm data")
custom_sheets["survey"] = merge_dataframes(
mandatory_sheets.get("survey"), custom_sheets.get("survey"), digitisation_sheets.get("survey")
Expand All @@ -223,10 +251,10 @@ async def append_mandatory_fields(
if not form_category_row.empty:
custom_sheets["survey"].loc[custom_sheets["survey"]["name"] == "form_category", "calculation"] = f"once('{form_category}')"

if "choices" not in custom_sheets:
msg = "Choices sheet is required in XLSForm!"
log.error(msg)
raise ValueError(msg)
# Ensure the 'choices' sheet exists in custom_sheets
if "choices" not in custom_sheets or custom_sheets["choices"] is None:
custom_sheets["choices"] = pd.DataFrame(columns=["list_name", "name", "label::english(en)"])

log.debug("Merging choices sheet XLSForm data")
custom_sheets["choices"] = merge_dataframes(
mandatory_sheets.get("choices"), custom_sheets.get("choices"), digitisation_sheets.get("choices")
Expand Down
Binary file modified osm_fieldwork/xlsforms/common/digitisation_fields.xls
Binary file not shown.
Binary file modified osm_fieldwork/xlsforms/common/mandatory_fields.xls
Binary file not shown.

0 comments on commit 46555e7

Please sign in to comment.