Module Code: est_avg_age.py for Experiment 6 & 7

As promised in Estimate World Average Age: Part 5 here’s the current state of the code in my module population/est_avg_age.py. Most of the changes since the last posting cover experiments 6 & 7.

I have also included the code for my blog related module, population/play/t7_build_table.py. Just in case you are interested.

And, I once again I do apologize at how messy the est_avg_age.py module’s code is getting. Now, I expect I should in some fashion refactor the whole thing and I may, in future, do so. But for now, it is what it is. Too much going on in my life at the moment. Just struggling to get the posts written and the code working. And, yes the module documentation has not been kept up, or even attempted for most of the function definitions.

The code also contains numerous commented out debug statements (mostly print() statements). I was too lazy to got through the listing and remove them all. Sorry!

est_avg_age.py

"""Code for estimating the world's average (median) age by random sampling.
   Tied to blog post, world_avg_age_*/*.

   This is version 2, needed to rethink how I was doing things due to the time my sample generation
   could take. It is based on play/est_avg_age.test.py

   Planning to try and do it all in memory rather than repeated i/o.

   Will be repeating code in a lot of other modules, but, no way around that while I do my testing

   Actually !! going to generate a new csv file that has country and mod continent column from the latest csv file
   plus a new valid country name column that has the proper name from the population database csv. Will
   use that csv to generate random names for simulation.

File: py_play/population/est_avg_age.py

Module level and/or global variables:
----

  _CSV_NM: database population data CSV = 'WPP2019_PopulationByAgeSex_Medium.csv'
  _LST_NM: old country/region name list/CSV = 'cr_list.txt'
  _LST_CC_NM: new country/region name + locID list/CSV = 'cr_code_list.csv'
  _CSV_COUNTRY: downloaded country, continent list/CSV = 'country-continent-codes.csv'
  _CSV_MOD_CC: new country, continent list/CSV with valid names from _LST_CC_NM = 'mod_country-continent.csv'

  _BLD_DB_CC_CSV: build or rebuild revised country/locID CSV = False
  _BLD_MOD_CSV: build or rebuild modified country-continent CSV = False
  _DO_SAMPLING: do a sampling run/test = True

  _S_SIZE: the number of countries to randomly select for each sample, default 5
  _REPEATS: the number of times to repeat our complete process (random countries, calculations, etc) before
            we look to see what we have determined -- if anything, default 100

  _LABELS: a list of the label for each bin in our country data
  _L_LOWS: a lits of the lower boundaries for the above labels (the quartile calculation function needs those)

  use_yr: the year for which to estimate average age, default 2011
  n_tst: which test to run, default = 1
  t_size: actual sample size to use for test, default = _S_Size
  t_rpt: actual number of repeitions to use for test, default = _REPEATS
  t_stat: if applicable, which sample statistic is to be displayed = 1

Functions:

  get_sample(r_nbr): get data for r_nbr countries, generate and return sample statistics
  plot_hist(ax, h_data, s_yr, s_type, c_size, s_rep): generate est avg age histogram in specified Axes using supplied data
  t_conf_int(s_size, s_sdev, p_interval=95, tails=2)
  get_single_sample(r_nbr)
  display_stats_w_rpts()
  display_stats_1_sample()
  plot_hist_basic()

"""
import csv
import numpy as np
import pathlib
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statistics
import time
import argparse

# Pylance: disable=reportMissingImports
#import matplotlib.pyplot as plt
#from scipy import stats
from chart import chart
from chart import descriptive_stats as dstat
from database import population as pdb
from database import rc_names as rcn

#import math

# Test for some of the most likely starting working directories.
# If found set DATA_DIR accordingly. Using pathlib to cover operating system issues.
# If not one of them print message and exit.
D_PATH = ''
P_PATH = ''
p = pathlib.Path.cwd()
print(p)
if p.name == 'py_play':
  D_PATH = pathlib.Path('./data')
  P_PATH = pathlib.Path('./population/play')
elif p.name == 'population':
  D_PATH = pathlib.Path('../data')
  P_PATH = pathlib.Path('./play')
elif p.name == 'database':
  D_PATH = pathlib.Path('../../data')
else:
  print(f"\nCurrent working directory is an unexpected location, {p}!")
  print(f"Please run the application from the root directory. Quitting application")
  exit(1)

# module constants
_CSV_NM = 'WPP2019_PopulationByAgeSex_Medium.csv'
_LST_NM = 'cr_list.txt'
_LST_CC_NM = 'cr_code_list.csv'
_CSV_COUNTRY = 'country-continent-codes.csv'
_CSV_MOD_CC = 'mod_country-continent.csv'

_BLD_DB_CC_CSV = False
_BLD_MOD_CSV = False
_DO_SAMPLING = True

_S_SIZE = 5
_REPEATS = 100

_LABELS = chart.get_agrp_lbls()
_L_LOWS = dstat.get_grp_low_bdrys(_LABELS)
_L_MIDS = dstat.get_grp_middle(_LABELS)

# define some module level, empty/default variables

df_country = pd.DataFrame(columns = ['continent','country','valid'])
valid_country = []
c_data  = {}
n_tst = 1
t_size = _S_SIZE
t_rpt = _REPEATS
t_stat = 0
use_yr = '2011'
w_median = 0
e7_f_nbr = '0'

def get_sample(r_nbr):
  global df_country, c_data
  s_medians = {}
  mean_ms = 0
  wt_mean_ms = 0
  sum_median = 0
  c_pops = []
  sum_pop = [0 for _ in range(21)]
  c_tot = {}
  
  rand_c = df_country.sample(r_nbr)
  for _, row in rand_c.iterrows():
    c_nm = row['valid']
    s_data = c_data[c_nm]
    c_pops.append(s_data)
    c_tot[c_nm] = sum(s_data)
    s_median = dstat.get_wt_quartile(_L_LOWS, c_data[c_nm])
    s_medians[c_nm] = s_median

  mean_ms = sum(s_medians.values()) / r_nbr
  wt_medians = [s_medians[nm] * c_tot[nm] for nm in s_medians.keys()]
  wt_mean_ms = sum(wt_medians) / sum(c_tot.values())
  sum_pop = [sum(n) for n in zip(*c_pops)]
  sum_median = dstat.get_wt_quartile(_L_LOWS, sum_pop)
  #sum_median = dstat.get_wt_mean(_L_MIDS, sum_pop)

  return (mean_ms, wt_mean_ms, sum_median)


def plot_hist(ax, h_data, s_yr, s_type, c_size, s_rep):
  ax.set_xlabel(f'Age ({s_type})')
  ax.set_title(f"Simulation: {c_size} countries X {s_rep} repetitions")
  n, bins, patches = ax.hist(h_data, bins=range(0, 60))

  s_mean = statistics.mean(h_data)
  s_stdev = statistics.stdev(h_data)
  std_err = 1.96 * s_stdev / (s_rep**0.5)
  mean_p95 = s_mean + std_err
  mean_m95 = s_mean - std_err
  terr = t_conf_int(c_size, s_stdev)
  t_p95 = s_mean + terr
  t_m95 = s_mean - terr

  ax.axvline(s_mean, 0, 1, color='r', label=f'Est. Average Age: {s_mean:.2f}')
  ax.axvline(mean_m95, 0, 1, color='g', ls=':', label=f'Z 95% conf interval: {mean_m95:.2f}')
  ax.axvline(mean_p95, 0, 1, color='m', ls=':', label=f'Z 95% conf interval: {mean_p95:.2f}')
  ax.axvline(t_m95, 0, 1, color='c', label=f't 95% conf interval: {t_m95:.2f}')
  ax.axvline(t_p95, 0, 1, color='y', label=f't 95% conf interval: {t_p95:.2f}')
  # not actually plotting anything as such, just want value to show in legend area
  ax.axvline(s_stdev, 0, 0, color='k', label=f'std dev: {s_stdev:.2f}')
  # ax.axvline(w_median, 0, 0, color='k', label=f'World Average: {w_median:.2f}')
  ax.legend()


def t_conf_int(s_size, s_sdev, p_interval=95, tails=2):
  # defualt 95% confidence interval for 2 tail sample
  t = stats.t.ppf(1- ((100-p_interval)/tails/100), s_size-1)
  d_int = t * s_sdev / (s_size**0.5)
  #print(f"t_conf_int({s_size}, {s_sdev}) = {t} * {s_sdev} / {s_size**0.5} = {d_int}") 
  return d_int


def get_single_sample(r_nbr, df_seed=0):
  global df_country, c_data
  c_medians = {}
  c_totals = {}
  wt_mean = 0
  sum_median = 0
  c_pops = []
  sum_pop = [0 for _ in range(21)]
  
  if df_seed:
    rand_c = df_country.sample(r_nbr, random_state=df_seed)
  else:
    rand_c = df_country.sample(r_nbr)
  for _, row in rand_c.iterrows():
    c_nm = row['valid']
    s_data = c_data[c_nm]
    c_pops.append(s_data)
    c_totals[c_nm] = sum(s_data)
    c_median = dstat.get_wt_quartile(_L_LOWS, s_data)
    c_medians[c_nm] = c_median

  wt_medians = [c_medians[nm] * c_totals[nm] for nm in c_medians.keys()]
  wt_mean = sum(wt_medians) / sum(c_totals.values())
  sum_pop = [sum(n) for n in zip(*c_pops)]
  sum_median = dstat.get_wt_quartile(_L_LOWS, sum_pop)

  return c_medians, c_totals, wt_mean, sum_median


def display_stats_w_rpts():
  print(f"Random sample of {t_size} countries, repeated {t_rpt} times")

  print(f"\n{'':14} | {'Avg':>5} | {'std dev'} | {'norm 95% conf':13} | {'t 95% conf':13}")
  print(f"{'':14} | {'':5} | {'':7} | n={t_rpt:<11} | n={t_size:<11}")
  print(f"\n{'Simple mean:':14} | {mean_means:.2f} | {sd_means:7.2f} | {means_m95:.2f} - {means_p95:.2f} | {means_t_m95:.2f} - {means_t_p95:.2f}")
  print(f"{'Weighted mean:':14} | {mean_wtd:.2f} | {sd_wtd:7.2f} | {wtd_m95:.2f} - {wtd_p95:.2f} | {wtd_t_m95:.2f} - {wtd_t_p95:.2f}")
  print(f"{'Summed median:':14} | {mean_sumd:.2f} | {sd_sumd:7.2f} | {sumd_m95:.2f} - {sumd_p95:.2f} | {sumd_t_m95:.2f} - {sumd_t_p95:.2f}")
  print(f"{'World median:':14} | {w_median:.2f}")


def display_stats_1_sample():
  print(f"Random sample of {t_size} countries, one time")

  print(f"\n{'':14} | {'Avg':>5} | {'std dev'} | {'std err':7} | {'t 95% conf':13}")
  print(f"{'':14} | {'':5} | {'':7} | {'':7} | n={t_size:<11}")
  print(f"\n{'Simple mean:':14} | {mean_means:.2f} | {sd_means:7.2f} | {terr_means:7.2f} | {means_t_m95:.2f} - {means_t_p95:.2f}")
  print(f"{'Weighted mean:':14} | {mean_wtd:.2f} | {sd_wtd:7.2f} | {terr_wtd:7.2f} | {wtd_t_m95:.2f} - {wtd_t_p95:.2f}")
  print(f"{'Summed median:':14} | {mean_sumd:.2f} | {sd_sumd:7.2f} | {terr_sumd:7.2f} | {sumd_t_m95:.2f} - {sumd_t_p95:.2f}")
  print(f"{'World median:':14} | {w_median:.2f}")


def plot_hist_basic(ax, h_data, **kwargs):
  # Defaults
  x_def = 'Age'
  y_def = 'Count'
  ttl_def = f"Simulation: {len(h_data)} countries, no repeats"
  mean_lbl_def = "Est. Average Age"
  do_lgnd = False

  # Optional keyworded arguments and derived values
  x_lbl = kwargs.get('x_lbl', x_def)
  y_lbl = kwargs.get('y_lbl', y_def)
  p_ttl = kwargs.get('title', ttl_def)
  s_mean = kwargs.get('mean', None)
  mean_lgnd = kwargs.get('m_lbl', mean_lbl_def)
  s_stdev = kwargs.get('stdev', None)
  s_std_err = kwargs.get('std_err', None)
  m95 = None
  p95 = None
  if s_mean and s_std_err:
    m95 = s_mean - s_std_err
    p95 = s_mean + s_std_err

  # now onto plotting
  ax.set_xlabel(x_lbl)
  ax.set_ylabel(y_lbl)
  ax.set_title(p_ttl)

  n, bins, patches = ax.hist(h_data, bins=range(0, 60))

  if s_mean:
    ax.axvline(s_mean, 0, 1, color='r', label=f'{mean_lgnd}: {s_mean:.2f}')
  if m95:
    ax.axvline(m95, 0, 1, color='hotpink', ls='-.', label=f'min 95% conf interval: {m95:.2f}')
  if p95:
    ax.axvline(p95, 0, 1, color='orange', ls='-.', label=f'max 95% conf interval: {p95:.2f}')
  # not actually plotting anything as such, just want value to show in legend area
  if s_stdev:
    ax.axvline(s_stdev, 0, 0, color='w', label=f'std dev: {s_stdev:.2f}')
  # not actually plotting anything as such, just want value to show in legend area
  if s_std_err:
    ax.axvline(s_std_err, 0, 0, color='w', label=f'std err: {s_std_err:.2f}')
  # ax.axvline(w_median, 0, 0, color='k', label=f'World Average: {w_median:.2f}')
  # display legend if appropriate
  if s_mean or s_stdev or s_std_err:
    ax.legend()


def all_stat_sng_sample(r_nbr, df_seed=0):
  if not df_seed:
    s_medians, s_totals, mean_wtd, mean_sumd = get_single_sample(r_nbr)
  else:
    s_medians, s_totals, mean_wtd, mean_sumd = get_single_sample(r_nbr, df_seed=df_seed)

  mean_means = statistics.mean(s_medians.values())
  sd_means = statistics.stdev(s_medians.values())
  terr_means = t_conf_int(r_nbr, sd_means)

  sd_wtd = np.sqrt(np.cov(list(s_medians.values()), aweights=list(s_totals.values())))
  terr_wtd = t_conf_int(r_nbr, sd_wtd)

  sd_sumd = sd_wtd
  terr_sumd = t_conf_int(r_nbr, sd_sumd)

  p_means = [mean_means, mean_wtd, mean_sumd]
  p_sds = [sd_means, sd_wtd, sd_sumd]
  p_ses = [terr_means, terr_wtd, terr_sumd]

  return s_medians, s_totals, p_means, p_sds, p_ses


if __name__ == '__main__':
  tst_desc = ['na',
              "Plot Histograms of 3 Types of Repeated Sample Data",
              "Plot 4 histograms showing sample statistic for various repetitions",
              "Plot 4 histograms showing sample statistic for various sample sizes",
              "Confidence Interval Variations",
              "Generate and plot a single sample of the specified size",
              "Generate and compare samples of varying sizes",
              "Compare repeated samples of a single size",
              "Build table from test data",
              # following should always be the last test
              "Plot Single Statistic, One Histogram"
  ]
  max_tst = len(tst_desc) - 1
  smpl_types =['Simple Mean of Sample Medians',
              'Weighted Mean of Sample Medians',
              'Median of Summed Sample Data'
  ]
  max_smpl = len(smpl_types)
  max_size = 50
  max_rpt = 10000
  # defaults

  # define and get command line arguments
  parser = argparse.ArgumentParser()
  # long name preceded by --, short by single -, get it as an integer so can use to access test data array
  parser.add_argument('--do_exp', '-e', type=int, help=f'Which experiment to run (1-{max_tst})')
  parser.add_argument('--c_cnt', '-c', type=int, help=f'Number of countries to use per randome sample (1-{max_size})')
  parser.add_argument('--rpts', '-r', type=int, help=f'How many sample repetitions to use for current run (1-{max_rpt})')
  parser.add_argument('--stat', '-s', type=int, help=f'Which sample statistic to display (1-{max_smpl}), if applicable')
  parser.add_argument('--year', '-y', help=f'Which year to use for sample run (1950-2100)')
  parser.add_argument('--e7_nbr', '-n', help=f'File number for experiment 7, used to create file name')
  parser.add_argument('--e7_multiples', '-m', type=int, help=f'number of times (multiples) to run experiment 7')

  args = parser.parse_args()

  if args.do_exp:
    if (args.do_exp >= 1 and args.do_exp <= max_tst):
      n_tst = args.do_exp
    else:
      _DO_SAMPLING = False
  if args.c_cnt and (args.c_cnt >= 1 and args.c_cnt <= max_size):
    t_size = args.c_cnt
  if args.rpts and (args.rpts >= 1 and args.rpts <= max_rpt):
    t_rpt = args.rpts
  if args.stat and (args.stat >= 1 and args.stat <= max_smpl):
    t_stat = args.stat - 1
  if args.year and (int(args.year) >= 1950 and int(args.year) <= 2100):
    use_yr = args.year
  if args.e7_nbr:
    e7_f_nbr = args.e7_nbr

  #print(f"n_tst: {n_tst}, t_size: {t_size}, t_rpt: {t_rpt}, t_stat: {t_stat}, use_yr: {use_yr}")

  if _BLD_DB_CC_CSV:
    s_tm = time.time()
    rcn.make_db_name_id_csv()
    e_tm = time.time()
    print(f"building {_LST_CC_NM} took {e_tm - s_tm}")
  
  if _BLD_MOD_CSV:
    s_tm = time.time()
    rcn.make_mod_country_csv()
    e_tm = time.time()
    diff_tm = e_tm - s_tm
    print(f"modifying and saving dataframe took {diff_tm:.2f}")

  if _DO_SAMPLING:

    lst_means = []
    lst_wt_means = []
    lst_summed_mdns = []
    s_medians = {}
    s_totals = {}
    sumd_mds = 0
    mean_means, sd_means, serr_means, terr_means = (0, 0, 0, 0)
    mean_wtd, sd_wtd, serr_wtd, terr_wtd = (0, 0, 0, 0)
    mean_sumd, sd_sumd, serr_sumd, terr_sumd = (0, 0, 0, 0)
    h_lists = [[], [], []]

    sd_tm = time.time()
    
    df_country = pd.read_csv(D_PATH / _CSV_MOD_CC, comment='#', usecols = ['continent','country','valid'])

    valid_c = df_country['valid'].tolist()
    valid_c.append("World")
    valid_c.sort()

    c_data = pdb.get_crs_1yr_all(valid_c, use_yr)

    ed_tm = time.time()
    diff_tm = ed_tm - sd_tm
    print(f"loading CSV and getting data from population database took {diff_tm:.2f} ({sd_tm:.6f}-{ed_tm:.6f})")

    # get world median age for current year
    w_median = dstat.get_wt_quartile(_L_LOWS, c_data['World'])
    #w_median = dstat.get_wt_mean(_L_MIDS, c_data['World'])

    if t_rpt == 1:
      s_medians, s_totals, mean_wtd, mean_sumd = get_single_sample(t_size)

      mean_means = statistics.mean(s_medians.values())
      sd_means = statistics.stdev(s_medians.values())
      serr_means = 0
      terr_means = t_conf_int(t_size, sd_means)

      sd_wtd = np.sqrt(np.cov(list(s_medians.values()), aweights=list(s_totals.values())))
      serr_wtd = 0
      terr_wtd = t_conf_int(t_size, sd_wtd)

      sd_sumd = sd_wtd
      serr_sumd = 0
      terr_sumd = t_conf_int(t_size, sd_sumd)
    else:   
      for _ in range(t_rpt):
        mean_mds, wt_mean_mds, summed_md = get_sample(t_size)
        #print(mean_mds, wt_mean_mds, summed_md)
        lst_means.append(mean_mds)
        lst_wt_means.append(wt_mean_mds)
        lst_summed_mdns.append(summed_md)

      mean_means = statistics.mean(lst_means)
      sd_means = statistics.stdev(lst_means)
      serr_means = 1.96 * sd_means / (t_rpt**0.5)
      terr_means = t_conf_int(t_size, sd_means)
  
      mean_wtd = statistics.mean(lst_wt_means)
      sd_wtd = statistics.stdev(lst_wt_means)
      serr_wtd = 1.96 * sd_wtd / (t_rpt**0.5)
      terr_wtd = t_conf_int(t_size, sd_wtd)
  
      mean_sumd = statistics.mean(lst_summed_mdns)
      sd_sumd = statistics.stdev(lst_summed_mdns)
      serr_sumd = 1.96 * sd_sumd / (t_rpt**0.5)
      terr_sumd = t_conf_int(t_size, sd_sumd)

      h_lists = [lst_means, lst_wt_means, lst_summed_mdns]

    # calc 95% confidence intervals
    means_p95 = mean_means + serr_means
    means_m95 = mean_means - serr_means
    means_t_p95 = mean_means + terr_means
    means_t_m95 = mean_means - terr_means

    wtd_p95 = mean_wtd + serr_wtd
    wtd_m95 = mean_wtd - serr_wtd
    wtd_t_p95 = mean_wtd + terr_wtd
    wtd_t_m95 = mean_wtd - terr_wtd

    sumd_p95 = mean_sumd + serr_sumd
    sumd_m95 = mean_sumd - serr_sumd
    sumd_t_p95 = mean_sumd + terr_sumd
    sumd_t_m95 = mean_sumd - terr_sumd
    
    es_tm = time.time()
    diff_tm = es_tm - ed_tm
    print(f"\ngenerating sample stats took {diff_tm:.2f} ({ed_tm:.6f}-{es_tm:.6f})")

    print(f"\nTest #{n_tst}: {tst_desc[n_tst]}\n")

    # display appropriate data in terminal window
    if n_tst != 6:
      if t_rpt == 1:
        display_stats_1_sample()
      else:
        display_stats_w_rpts()

    if n_tst == 1:
      fig, axs = plt.subplots(2, 2, sharey=True, gridspec_kw={'hspace': .3}, figsize=(15,8))
      axs[1, 1].axis('off')
      fig.suptitle(f"Estimated World Average Age: {use_yr}")

      p_row = 0
      for i in range(len(h_lists)):
        plot_hist(axs[p_row, i % 2], h_lists[i], use_yr, smpl_types[i], t_size, t_rpt)
        if i > 0 and i % 2 == 1:
          p_row += 1

    elif n_tst == 2:
      pass

    elif n_tst == 3:
      pass

    elif n_tst == 4:
      z_95s = [(means_m95, means_p95, mean_means), (wtd_m95, wtd_p95, mean_wtd), (sumd_m95, sumd_p95, mean_sumd)]
      t_95s = [(means_t_m95, means_t_p95, mean_means), (wtd_t_m95, wtd_t_p95, mean_wtd), (sumd_t_m95, sumd_t_p95, mean_sumd)]
      for i in range(len(smpl_types)):
        in_z_95_low = 0
        in_z_95_high = 0
        in_t_95_low = 0
        in_t_95_high = 0
        for mm in h_lists[i]:
          if mm >= z_95s[i][0] and mm <= z_95s[i][1]:
            if mm < z_95s[i][2]:
              in_z_95_low += 1
            else:
              in_z_95_high += 1
          if mm >= t_95s[i][0] and mm <= t_95s[i][1]:
            if mm < z_95s[i][2]:
              in_t_95_low += 1
            else:
              in_t_95_high += 1
        print(f"\n{(in_z_95_low + in_z_95_high)/t_rpt*100:.2f}% of samples fall within Z 95% confidence interval for {smpl_types[i]}")
        print(f"\t{in_z_95_low} below the mean and {in_z_95_high} above")
        print(f"{(in_t_95_low + in_t_95_high)/t_rpt*100:.2f}% of samples fall within t 95% confidence interval for {smpl_types[i]}")
        print(f"\t{in_t_95_low} below the mean and {in_t_95_high} above")

    elif n_tst == 5:
      fig, axs = plt.subplots(2, 2, figsize=(15,9), sharey=True)
      # x_lbl = kwargs.get('x_lbl', x_def)
      # p_ttl = kwargs.get('title', ttl_def)
      # s_mean = kwargs.get('mean', None)
      # s_stdev = kwargs.get('stdev', None)
      # s_std_err = kwargs.get('std_err', None)
      fig.suptitle(f"Estimated World Average Age for {use_yr} ({t_size} countries, one time)")

      kargs = {
        'title': '',
        'x_lbl': f'Average Age (Median Age)',
        'mean': mean_means,
        'm_lbl': "Est. Average Age (simple mean)",
        'stdev': sd_means,
        'std_err': terr_means
        }
      plot_hist_basic(axs[0, 0], list(s_medians.values()), **kargs)

      kargs = {
        'title': '',
        'x_lbl': f'Average Age (Median Age)',
        'mean': mean_wtd,
        'm_lbl': "Est. Average Age (weighted mean)",
        'stdev': sd_wtd,
        'std_err': terr_wtd
        }
      plot_hist_basic(axs[0, 1], list(s_medians.values()), **kargs)

      kargs = {
        'title': '',
        'x_lbl': f'Average Age (Median Age)',
        'mean': mean_sumd,
        'm_lbl': "Est. Average Age (median of sum)",
        'stdev': sd_sumd,
        'std_err': terr_sumd
        }
      plot_hist_basic(axs[1, 0], [mean_sumd], **kargs)

    elif n_tst == 6:
      #print(f"t_stat = {t_stat}")
      fig, axs = plt.subplots(2, 2, figsize=(15,12), sharey=True)
      fig.suptitle(f"Estimated World Average Age for {use_yr} (various sample sizes, one time)")
      ttl_stat = ['simple mean', 'weighted mean', 'median of sum']

      print(f"Random sample of varying numbers of countries, {ttl_stat[t_stat]}, one time")

      print(f"\n{'Sample Size':14} | {'Avg':>5} | {'std dev'} | {'std err':7} | {'t 95% conf':13}")
      print(f"{'':14} | {'':5} | {'':7} | {'':7} | {'':<11}")

      r_nbr = 0
      for i in range(4):
        if i > 0 and i % 2 == 0:
          r_nbr += 1
        c_nbr = t_size + (i * 10)

        p_medians, p_tots, p_mean, p_sd, p_se = all_stat_sng_sample(c_nbr)

        m95 = p_mean[t_stat] - p_se[t_stat]
        p95 = p_mean[t_stat] + p_se[t_stat]
        int_95 = f"{m95:.2f} - {p95:.2f}"

        print(f"{c_nbr:14} | {p_mean[t_stat]:5.2f} | {p_sd[t_stat]:7.2f} | {p_se[t_stat]:7.2f} | {int_95}")

        kargs = {
          'title': f'Sample of {len(p_medians)} countries',
          'x_lbl': f'Average Age (Median Age)',
          'mean': p_mean[t_stat],
          'm_lbl': f"Est. Average Age ({ttl_stat[t_stat]})",
          'stdev': p_sd[t_stat],
          'std_err': p_se[t_stat]
          }

        if t_stat == 0:
          plot_hist_basic(axs[r_nbr, i % 2], list(p_medians.values()), **kargs)
        elif t_stat == 1:
          plot_hist_basic(axs[r_nbr, i % 2], list(p_medians.values()), **kargs)
        elif t_stat == 2:
          plot_hist_basic(axs[r_nbr, i % 2], [p_mean[t_stat]], **kargs)

      print(f"{'World median:':14} | {w_median:.2f}")

    elif n_tst == 7:
      # https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.bar.html
      # https://matplotlib.org/3.1.0/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py
      t7_data_file = f't7_samples_{e7_f_nbr}.txt'
      t7_rpt = 10
      t7_size = 33
      t7_runs = 1
      t7_stat = 0
      t7_means = []
      t7_stderrs = []

      t7_seed = 349

      if args.stat and (args.stat >= 1 and args.stat <= max_smpl):
        t7_stat = args.stat - 1
      if args.rpts and (args.rpts >= 1 and args.rpts <= max_rpt):
        t7_rpt = args.rpts
      if args.c_cnt and (args.c_cnt >= 1 and args.c_cnt <= max_size):
        t7_size = args.c_cnt
      if args.e7_multiples and (args.e7_multiples >= 1 and args.e7_multiples <= 20):
        t7_runs = args.e7_multiples

      def print_tbl(s_means, s_serrs):
        print(f"Sample means +- std err for {t7_rpt} samples of size {t7_size}")
        for i, s_mean in enumerate(s_means):
          if i > 0:
            print(" | ", end='')
          print(f"{s_mean:.1f} +- {s_serrs[i]:.1f}", end='')
        print()  

      missed_interval = 0
      missed_1st_95 = 0

      tmp_seed = t7_seed
      for rnbr in range(1, t7_runs + 1):
        tmp_seed += ((rnbr - 1) * 10031)
        t7_means = []
        t7_stderrs = []
        for i in range(t7_rpt):
          tmp_seed += (i * 13)
          p_medians, p_tots, p_mean, p_sd, p_se = all_stat_sng_sample(t7_size, df_seed=tmp_seed)
          t7_means.append(p_mean[t7_stat])
          t7_stderrs.append(p_se[t7_stat])

        missed_interval = 0
        missed_1st_95 = 0
        fst_m95 = t7_means[0] - t7_stderrs[0]
        fst_p95 = t7_means[0] + t7_stderrs[0]
        for i, mean in enumerate(t7_means):
          if w_median < mean - t7_stderrs[i] or w_median > mean + t7_stderrs[i]:
            missed_interval += 1
          if mean < fst_m95 or mean > fst_p95:
            missed_1st_95 += 1

        print_tbl(t7_means, t7_stderrs)
        with open(P_PATH / t7_data_file, 'a') as fl_dbug:
          fl_dbug.write(f"'{rnbr}': {{\n")
          fl_dbug.write(f"  'means': {t7_means},\n")
          fl_dbug.write(f"  'stderrs': {t7_stderrs},\n")
          fl_dbug.write(f"  'missed': ({missed_interval}, {missed_1st_95})\n")
          fl_dbug.write(f"     }},\n")

      ind = np.arange(len(t7_means))  # the x locations for the groups
      width = 0.7  # the width of the bars

      f_wd = 10
      f_ht = 6
      if t7_rpt > 19:
        f_wd = 15
        f_ht = 9
      elif t7_rpt > 34:
        f_wd = 20
        f_ht = 12
      
      fig, ax = plt.subplots(figsize=(f_wd,f_ht))
      rects1 = ax.bar(ind, t7_means, width, yerr=t7_stderrs)

      # Add some text for labels, title and custom x-axis tick labels, etc.
      ax.set_ylabel(f'Est. Average Age ({smpl_types[t7_stat]})')
      ax.set_title(f'Estimated Average Age from {t7_rpt} samples of size {t7_size}')
      ax.set_xticks(ind)

      m95_1 = t7_means[t7_stat] - t7_stderrs[t7_stat]
      p95_1 = t7_means[t7_stat] + t7_stderrs[t7_stat]
      ax.axhspan(m95_1, p95_1, facecolor='green', alpha=0.2)
      ax.axhline(w_median, color='r')
      ax.axvline(missed_interval, 0, 0, color='w', label=f'world avg not in conf interval: {missed_interval}')
      #ax.axvline(missed_1st_95, 0, 0, color='w', label=f'sample means not in first samples 95% conf: {missed_1st_95}')
      ax.legend()

      def autolabel(rects, xpos='center'):
        """
        Attach a text label above each bar in *rects*, displaying its height.

        *xpos* indicates which side to place the text w.r.t. the center of
        the bar. It can be one of the following {'center', 'right', 'left'}.
        """

        ha = {'center': 'center', 'right': 'left', 'left': 'right'}
        offset = {'center': 0, 'right': 1, 'left': -1}

        for rect in rects:
          height = rect.get_height()
          ax.annotate(f'{height:.1f}',
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(offset[xpos]*3, 3),  # use 3 points offset
                        textcoords="offset points",  # in both directions
                        ha=ha[xpos], va='bottom')

      autolabel(rects1, "left")

    elif n_tst == 8:
      import t7_samples_3_tst as t7sr
      #print(t7sr.sample_runs)

      table = []
      table.append('Sample    |')
      for i in range(len(t7sr.sample_runs['1']['means'])):
        table.append(f'Sample {(i + 1):2} | ')
      for i in range(len(t7sr.sample_runs['1']['means'])):
        pass
      print("\n".join(table))


    elif n_tst == max_tst:
      fig, ax = plt.subplots(figsize=(10,6))
      fig.suptitle(f"Estimated World Average Age: {use_yr}")
      plot_hist(ax, h_lists[t_stat], use_yr, smpl_types[t_stat], t_size, t_rpt)

    es_tm = time.time()
    diff_tm = es_tm - sd_tm
    print(f"\nwhole sampling process took {diff_tm:.2f}")

    plt.show()

t7_build_table.py

This one not likely of much interest to you, but just in case…

"""Code for generating HTML tables and/or plots for posts world_avg_age_*/*.
   Uses data generated by experiment #7 in est_avg_age.py
"""

import sys
from pathlib import Path
import argparse
import matplotlib.pyplot as plt

file = Path(__file__).resolve()
parent, root = file.parent, file.parents[0]
#print(file.parents[0])
#print(f"parent: {parent}; root: {root}\n")
sys.path.append(str(root))
# Additionally remove the current file's directory from sys.path
try:
  sys.path.remove(str(parent))
except ValueError: # Already removed
  pass

min_tst = 2
max_tst = 5
do_tst = 2

# define and get command line arguments
parser = argparse.ArgumentParser()
# long name preceded by --, short by single -, get it as an integer so can use to access test data array
parser.add_argument('--do_tst', '-t', type=int, help=f'Which test run to use ({min_tst}-{max_tst})')

args = parser.parse_args()

if args.do_tst:
  if (args.do_tst >= min_tst and args.do_tst <= max_tst):
    do_tst = args.do_tst
  else:
    print(f"Number for test run, {args.do_tst}, not valid, should be {min_tst}-{max_tst}")
  
import t7_samples_2_tst as t7ts

if do_tst == 1:
  exit(1)
elif do_tst == 2:  
  import t7_samples_2_tst as t7ts
elif do_tst == 3:
  import t7_samples_3_tst as t7ts
elif do_tst == 4:
  import t7_samples_4_tst as t7ts
elif do_tst == 5:
  print(f"importing t7_samples_5_tst")
  import t7_samples_5_tst as t7ts

if True:
  #print(t7ts.sample_runs)
  row_tbl = []
  t_out = f"r:/learn/py_play/population/play/t7_tbl_out_{do_tst}.html"

  with open(t_out, 'w') as tbl_out:
    tbl_out.write('<table>\n')
    row_tbl.append("<tr><th>&nbsp;</th>")
    
    nbr_samples = len(t7ts.sample_runs['1']['means'])
    l_runs = list(t7ts.sample_runs.keys())

    for i in range(nbr_samples):
      s_mean = f"{t7ts.sample_runs['1']['means'][i]:.1f}"
      s_err = f"{t7ts.sample_runs['1']['stderrs'][i]:.1f}"
      row_tbl.append(f"<tr><th>Sample {i + 1}</th>")
      #row_tbl.append(f"<tr><th>Sample {i + 1}:</th><td>{s_mean} &plusmn;{s_err}</td>")
    row_tbl.append("<tr><th>Run Mean</th>")  
    row_tbl.append("<tr><th>True Missed</th>")
    row_tbl.append("<tr><th>In 1st CI</th>")
    
    for run, data in t7ts.sample_runs.items():
      row_tbl[0] += f"<th>Run {run}</th>"
      for i in range(nbr_samples):
        s_mean = f"{t7ts.sample_runs[run]['means'][i]:.1f}"
        s_err = f"{t7ts.sample_runs[run]['stderrs'][i]:.1f}"
        row_tbl[i + 1] += f"<td>{s_mean} &plusmn;{s_err}</td>"
      avg_means = sum(t7ts.sample_runs[run]['means']) / nbr_samples
      row_tbl[nbr_samples + 1] += f"<td>{avg_means:.1f}</td>"
      row_tbl[nbr_samples + 2] += f"<td>{t7ts.sample_runs[run]['missed'][0]:>1}</td>"
      row_tbl[nbr_samples + 3] += f"<td>{t7ts.sample_runs[run]['missed'][1]:>1}</td>"

    for row in row_tbl:
      tbl_out.write(f"{row}</tr>\n")

    tbl_out.write("</table>\n")

# plot histogram of all samples

def plot_hist_basic(ax, h_data, **kwargs):
  # Defaults
  x_def = 'Age'
  y_def = 'Count'
  ttl_def = f"Simulation: {len(h_data)} countries, no repeats"
  mean_lbl_def = "Est. Average Age"
  rng_min = 22
  rng_max = 34
  nbr_bin = 48
  do_lgnd = False

  # Optional keyworded arguments and derived values
  x_lbl = kwargs.get('x_lbl', x_def)
  y_lbl = kwargs.get('y_lbl', y_def)
  p_ttl = kwargs.get('title', ttl_def)
  s_mean = kwargs.get('mean', None)
  mean_lgnd = kwargs.get('m_lbl', mean_lbl_def)
  s_stdev = kwargs.get('stdev', None)
  s_std_err = kwargs.get('std_err', None)
  m95 = None
  p95 = None
  if s_mean and s_std_err:
    m95 = s_mean - s_std_err
    p95 = s_mean + s_std_err
  pop_mean = kwargs.get('pop_mean')
  
  # now onto plotting
  ax.set_xlabel(x_lbl)
  ax.set_ylabel(y_lbl)
  ax.set_title(p_ttl)

  n, bins, patches = ax.hist(h_data, bins=nbr_bin, range=(rng_min, rng_max))

  if s_mean:
    ax.axvline(s_mean, 0, 1, color='r', label=f'{mean_lgnd}: {s_mean:.2f}')
  if m95:
    ax.axvline(m95, 0, 1, color='hotpink', ls='-.', label=f'min 95% conf interval: {m95:.2f}')
  if p95:
    ax.axvline(p95, 0, 1, color='orange', ls='-.', label=f'max 95% conf interval: {p95:.2f}')
  # not actually plotting anything as such, just want value to show in legend area
  if s_stdev:
    ax.axvline(s_stdev, 0, 0, color='w', label=f'std dev: {s_stdev:.2f}')
  # not actually plotting anything as such, just want value to show in legend area
  if s_std_err:
    ax.axvline(s_std_err, 0, 0, color='w', label=f'std err: {s_std_err:.2f}')
  # ax.axvline(w_median, 0, 0, color='k', label=f'World Average: {w_median:.2f}')
  # display legend if appropriate
  if pop_mean:
    ax.axvline(pop_mean, 0, 1, color='aqua', label=f'Population average: {pop_mean:.2f}')
  if s_mean or s_stdev or s_std_err:
    ax.legend()


all_smpl = []
for run, data in t7ts.sample_runs.items():
  all_smpl += t7ts.sample_runs[run]['means']
nbr_smpl = len(all_smpl)
#print(all_smpl)

#fig, axs = plt.subplots(2, 2, figsize=(15,9), sharey=True)
f_wd = 10
f_ht = 6
fig, ax = plt.subplots(figsize=(f_wd,f_ht))
# x_lbl = kwargs.get('x_lbl', x_def)
# p_ttl = kwargs.get('title', ttl_def)
# s_mean = kwargs.get('mean', None)
# s_stdev = kwargs.get('stdev', None)
# s_std_err = kwargs.get('std_err', None)
fig.suptitle(f"Estimated World Average Age of {nbr_smpl} Samples")
world_avg = 28.68
smpl_mean = sum(all_smpl) / nbr_smpl
rnd_mean = f"{smpl_mean:.2f}"
plt_mean = float(rnd_mean)

kargs = {
  'title': '',
  'mean': plt_mean,
  'm_lbl': 'Mean of all samples:',
  'pop_mean': world_avg
  }
plot_hist_basic(ax, all_smpl, **kargs)

plt.show()