Examples: Census Data

This section shows how to explore and analyze the 2018 Census microdata without maps. To combine census data with geographic visualizations, see Census + Maps.


Explore available variables

Use the describe_* functions to learn what variables are available and what values they take.

import geoquetzal as gq

# Called with no argument: list all variables in the household dataset
gq.describe_hogares()

# Called with a variable code: details for that one variable
gq.describe_hogares("PCH4")
# Returns label, type, and coded values

# Person-level variables are looked up with describe_personas
gq.describe_personas("PCP12")
# PCP12 = ethnic self-identification: Maya, Garífuna, Xinka, Ladino...
library(geoquetzal)

# Called with no argument: list all variables in the household dataset
describe_hogares()

# Called with a variable code: details for that one variable
describe_hogares("PCH4")
# Returns label, type, and coded values

# Person-level variables are looked up with describe_personas
describe_personas("PCP12")
# PCP12 = ethnic self-identification: Maya, Garífuna, Xinka, Ladino...

Basic services access by department

What percentage of households have access to electricity in each department?

import geoquetzal as gq

# Download all households (~38 MB)
df = gq.hogares()

# PCH8 = lighting type; 1 = Electric grid.
# Missing PCH8 values are dropped before taking the share, so the denominator
# only counts households with a recorded answer. This matches the R example,
# which passes `na.rm = TRUE` to `mean()`.
electricidad = (
    df.groupby("departamento")["PCH8"]
    .apply(lambda x: x.dropna().eq(1).mean() * 100)
    .round(1)
    .reset_index(name="pct_electricity")
    .sort_values("pct_electricity", ascending=False)
)
print(electricidad.to_string(index=False))
library(geoquetzal)
library(dplyr)

# Fetch the full household table (~38 MB download)
df <- hogares()

# PCH8 = lighting type; 1 = Electric grid
# Step 1: one group per department
por_departamento <- group_by(df, departamento)

# Step 2: share of households on the electric grid, as a rounded percentage
resumen <- summarise(
  por_departamento,
  pct_electricity = round(mean(PCH8 == 1, na.rm = TRUE) * 100, 1)
)

# Step 3: highest coverage first
electricidad <- arrange(resumen, desc(pct_electricity))

print(electricidad)

Ethnic self-identification in Sacatepéquez

import geoquetzal as gq

df = gq.personas(departamento="Sacatepequez")

# PCP12 holds the ethnic self-identification code; this maps code -> label.
etiquetas_pcp12 = {
    1: "Maya",
    2: "Garífuna",
    3: "Xinka",
    4: "Afrodescendiente",
    5: "Ladino",
    6: "Extranjero",
}

# Relative frequency of each code, expressed as a percentage with one decimal.
proporciones = df["PCP12"].value_counts(normalize=True)
etnicidad = (proporciones * 100).round(1).rename(etiquetas_pcp12)
print(etnicidad)
library(geoquetzal)
library(dplyr)

df <- personas(departamento = "Sacatepequez")

# PCP12: ethnic self-identification
# 1=Maya, 2=Garífuna, 3=Xinka, 4=Afrodescendiente, 5=Ladino, 6=Extranjero
etnicidad <- df |>
  # Drop records with no PCP12 value so percentages are computed over
  # recorded answers only; count() would otherwise keep an NA row and
  # include it in sum(n). This matches the Python example, where
  # value_counts(normalize=True) excludes missing values.
  filter(!is.na(PCP12)) |>
  count(PCP12) |>
  mutate(
    pct = round(n / sum(n) * 100, 1),
    ethnicity = recode(as.character(PCP12),
      "1" = "Maya", "2" = "Garífuna", "3" = "Xinka",
      "4" = "Afrodescendiente", "5" = "Ladino", "6" = "Extranjero")
  ) |>
  select(ethnicity, pct) |>
  arrange(desc(pct))

print(etnicidad)

Dominant mother tongue per municipality

import geoquetzal as gq

df = gq.personas(departamento="Huehuetenango")


def _codigo_mas_frecuente(serie):
    """Return the most frequent non-missing code as an int, or None if none remain."""
    presentes = serie.dropna()
    if presentes.empty:
        return None
    return int(presentes.mode()[0])


# PCP15: language learned to speak in.
# The dominant language of a municipality is the mode of PCP15 within it.
por_municipio = df.groupby("MUNICIPIO")["PCP15"]
idioma_dominante = por_municipio.agg(_codigo_mas_frecuente).reset_index(
    name="idioma_dominante"
)

# Swap the numeric codes for the labels listed under "valores"
etiquetas = gq.describe_personas("PCP15")["valores"]
idioma_dominante["idioma_dominante"] = idioma_dominante["idioma_dominante"].map(etiquetas)
print(idioma_dominante.to_string(index=False))
library(geoquetzal)
library(dplyr)

df <- personas(departamento = "Huehuetenango")

# Code -> label lookup for PCP15, fetched once and reused in the join below
valores <- describe_personas("PCP15")$valores
etiquetas <- data.frame(
  idioma_dominante = as.integer(names(valores)),
  label = unlist(valores)
)

# Most frequent non-missing code in x (table() ignores NA by default).
# Returns NA when x has no recorded value, so summarise() still yields exactly
# one row per municipality; the Python example returns None in that case.
moda <- function(x) {
  conteos <- table(x)
  if (length(conteos) == 0) {
    return(NA_integer_)
  }
  as.integer(names(conteos)[which.max(conteos)])
}

# PCP15: language learned to speak in
# Mode per municipality = most frequent language
idioma_dominante <- df |>
  group_by(MUNICIPIO) |>
  summarise(idioma_dominante = moda(PCP15)) |>
  left_join(etiquetas, by = "idioma_dominante")

print(idioma_dominante)

Average years of education per municipality

import geoquetzal as gq
import pandas as pd

df = gq.personas(departamento="Sacatepequez")

# ANEDUCA is stored as VARCHAR, so convert it to numbers before averaging;
# values that cannot be parsed are coerced to NaN.
df["ANEDUCA"] = pd.to_numeric(df["ANEDUCA"], errors="coerce")

# Mean of ANEDUCA per municipality, one decimal, highest first
promedios = df.groupby("MUNICIPIO")["ANEDUCA"].mean().round(1)
aneduca = promedios.reset_index(name="aneduca_promedio")
aneduca = aneduca.sort_values("aneduca_promedio", ascending=False)
print(aneduca.to_string(index=False))
library(geoquetzal)
library(dplyr)

df <- personas(departamento = "Sacatepequez")

# ANEDUCA: years of education. It is stored as character, so it is cast to
# numeric before any aggregation.
df_numerico <- mutate(df, ANEDUCA = as.numeric(ANEDUCA))

# Average per municipality, ignoring missing values, rounded to one decimal
por_municipio <- group_by(df_numerico, MUNICIPIO)
promedios <- summarise(
  por_municipio,
  avg_years_education = round(mean(ANEDUCA, na.rm = TRUE), 1)
)

# Highest average first
aneduca <- arrange(promedios, desc(avg_years_education))

print(aneduca)

Digital divide — technology access

What percentage of households in Quiché have a computer and internet access? The example also reports car ownership.

import geoquetzal as gq
import pandas as pd

df = gq.hogares(departamento="Quiche")

# PCH9_H = computer, PCH9_I = internet, PCH9_M = car (1=Yes, 2=No)
indicadores = {"Computer": "PCH9_H", "Internet": "PCH9_I", "Car": "PCH9_M"}

# Missing answers are dropped before taking each share, so the denominator
# only counts households with a recorded value. This matches the R example,
# which passes `na.rm = TRUE` to `mean()`.
tecnologia = pd.DataFrame({
    "Indicator": list(indicadores),
    "% households": [
        df[columna].dropna().eq(1).mean() * 100
        for columna in indicadores.values()
    ],
}).round(1)
print(tecnologia.to_string(index=False))
library(geoquetzal)

df <- hogares(departamento = "Quiche")

# PCH9_H = computer, PCH9_I = internet, PCH9_M = car (1=Yes, 2=No)
# Percentage of "Yes" (code 1) answers in one column, ignoring missing values
pct_si <- function(columna) {
  mean(df[[columna]] == 1, na.rm = TRUE) * 100
}

# USE.NAMES = FALSE keeps the result unnamed, so data.frame() does not pick up
# the column codes as row names.
porcentajes <- vapply(
  c("PCH9_H", "PCH9_I", "PCH9_M"),
  pct_si,
  numeric(1),
  USE.NAMES = FALSE
)

tecnologia <- data.frame(
  Indicator = c("Computer", "Internet", "Car"),
  pct_households = round(porcentajes, 1)
)
print(tecnologia)

Sub-municipal data with lugares poblados

Pre-aggregated lugar poblado data enables sub-municipal analysis without processing microdata.

import geoquetzal as gq

# Pre-aggregated indicators for the lugares poblados of Antigua Guatemala
lp = gq.lugares_poblados(municipio="Antigua Guatemala")

# % of households with internet = "si" count over the "si" + "no" counts
con_internet = lp["pch9_i_si"]
total_respuestas = con_internet + lp["pch9_i_no"]
lp["pct_internet"] = (con_internet / total_respuestas * 100).round(1)

# The five lugares poblados with the highest internet access
columnas = ["nombre", "poblacion_total", "pct_internet"]
top5 = lp[columnas].sort_values("pct_internet", ascending=False).head(5)
print(top5.to_string(index=False))

# Ask the package what a given column means
gq.describe_lugares_poblados("pch9_i_si")
# {'variable': 'pch9_i_si',
#  'etiqueta': 'Conteo: tiene internet',
#  'tipo': 'equipamiento',
#  'fuente': 'hogar'}
library(geoquetzal)
library(dplyr)

# Pre-aggregated indicators for the lugares poblados of Antigua Guatemala
lp <- lugares_poblados(municipio = "Antigua Guatemala")

# % of households with internet = "si" count over the "si" + "no" counts
con_internet <- lp$pch9_i_si
total_respuestas <- con_internet + lp$pch9_i_no
lp$pct_internet <- round(con_internet / total_respuestas * 100, 1)

# The five lugares poblados with the highest internet access
top5 <- lp |>
  select(nombre, poblacion_total, pct_internet) |>
  arrange(desc(pct_internet)) |>
  head(5)
top5

# Ask the package what a given column means
describe_lugares_poblados("pch9_i_si")
# $variable: "pch9_i_si"
# $etiqueta: "Conteo: tiene internet"
# $tipo: "equipamiento"
# $fuente: "hogar"