As promised in Estimate World Average Age: Part 5 here’s the current state of the code in my module population/est_avg_age.py. Most of the changes since the last posting cover experiments 6 & 7.
I have also included the code for my blog related module, population/play/t7_build_table.py. Just in case you are interested.
And, I once again I do apologize at how messy the est_avg_age.py module’s code is getting. Now, I expect I should in some fashion refactor the whole thing and I may, in future, do so. But for now, it is what it is. Too much going on in my life at the moment. Just struggling to get the posts written and the code working. And, yes the module documentation has not been kept up, or even attempted for most of the function definitions.
The code also contains numerous commented out debug statements (mostly print() statements). I was too lazy to got through the listing and remove them all. Sorry!
est_avg_age.py
"""Code for estimating the world's average (median) age by random sampling.
Tied to blog post, world_avg_age_*/*.
This is version 2, needed to rethink how I was doing things due to the time my sample generation
could take. It is based on play/est_avg_age.test.py
Planning to try and do it all in memory rather than repeated i/o.
Will be repeating code in a lot of other modules, but, no way around that while I do my testing
Actually !! going to generate a new csv file that has country and mod continent column from the latest csv file
plus a new valid country name column that has the proper name from the population database csv. Will
use that csv to generate random names for simulation.
File: py_play/population/est_avg_age.py
Module level and/or global variables:
----
_CSV_NM: database population data CSV = 'WPP2019_PopulationByAgeSex_Medium.csv'
_LST_NM: old country/region name list/CSV = 'cr_list.txt'
_LST_CC_NM: new country/region name + locID list/CSV = 'cr_code_list.csv'
_CSV_COUNTRY: downloaded country, continent list/CSV = 'country-continent-codes.csv'
_CSV_MOD_CC: new country, continent list/CSV with valid names from _LST_CC_NM = 'mod_country-continent.csv'
_BLD_DB_CC_CSV: build or rebuild revised country/locID CSV = False
_BLD_MOD_CSV: build or rebuild modified country-continent CSV = False
_DO_SAMPLING: do a sampling run/test = True
_S_SIZE: the number of countries to randomly select for each sample, default 5
_REPEATS: the number of times to repeat our complete process (random countries, calculations, etc) before
we look to see what we have determined -- if anything, default 100
_LABELS: a list of the label for each bin in our country data
_L_LOWS: a lits of the lower boundaries for the above labels (the quartile calculation function needs those)
use_yr: the year for which to estimate average age, default 2011
n_tst: which test to run, default = 1
t_size: actual sample size to use for test, default = _S_Size
t_rpt: actual number of repeitions to use for test, default = _REPEATS
t_stat: if applicable, which sample statistic is to be displayed = 1
Functions:
get_sample(r_nbr): get data for r_nbr countries, generate and return sample statistics
plot_hist(ax, h_data, s_yr, s_type, c_size, s_rep): generate est avg age histogram in specified Axes using supplied data
t_conf_int(s_size, s_sdev, p_interval=95, tails=2)
get_single_sample(r_nbr)
display_stats_w_rpts()
display_stats_1_sample()
plot_hist_basic()
"""
import csv
import numpy as np
import pathlib
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statistics
import time
import argparse
# Pylance: disable=reportMissingImports
#import matplotlib.pyplot as plt
#from scipy import stats
from chart import chart
from chart import descriptive_stats as dstat
from database import population as pdb
from database import rc_names as rcn
#import math
# Test for some of the most likely starting working directories.
# If found set DATA_DIR accordingly. Using pathlib to cover operating system issues.
# If not one of them print message and exit.
D_PATH = ''
P_PATH = ''
p = pathlib.Path.cwd()
print(p)
if p.name == 'py_play':
D_PATH = pathlib.Path('./data')
P_PATH = pathlib.Path('./population/play')
elif p.name == 'population':
D_PATH = pathlib.Path('../data')
P_PATH = pathlib.Path('./play')
elif p.name == 'database':
D_PATH = pathlib.Path('../../data')
else:
print(f"\nCurrent working directory is an unexpected location, {p}!")
print(f"Please run the application from the root directory. Quitting application")
exit(1)
# module constants
_CSV_NM = 'WPP2019_PopulationByAgeSex_Medium.csv'
_LST_NM = 'cr_list.txt'
_LST_CC_NM = 'cr_code_list.csv'
_CSV_COUNTRY = 'country-continent-codes.csv'
_CSV_MOD_CC = 'mod_country-continent.csv'
_BLD_DB_CC_CSV = False
_BLD_MOD_CSV = False
_DO_SAMPLING = True
_S_SIZE = 5
_REPEATS = 100
_LABELS = chart.get_agrp_lbls()
_L_LOWS = dstat.get_grp_low_bdrys(_LABELS)
_L_MIDS = dstat.get_grp_middle(_LABELS)
# define some module level, empty/default variables
df_country = pd.DataFrame(columns = ['continent','country','valid'])
valid_country = []
c_data = {}
n_tst = 1
t_size = _S_SIZE
t_rpt = _REPEATS
t_stat = 0
use_yr = '2011'
w_median = 0
e7_f_nbr = '0'
def get_sample(r_nbr):
global df_country, c_data
s_medians = {}
mean_ms = 0
wt_mean_ms = 0
sum_median = 0
c_pops = []
sum_pop = [0 for _ in range(21)]
c_tot = {}
rand_c = df_country.sample(r_nbr)
for _, row in rand_c.iterrows():
c_nm = row['valid']
s_data = c_data[c_nm]
c_pops.append(s_data)
c_tot[c_nm] = sum(s_data)
s_median = dstat.get_wt_quartile(_L_LOWS, c_data[c_nm])
s_medians[c_nm] = s_median
mean_ms = sum(s_medians.values()) / r_nbr
wt_medians = [s_medians[nm] * c_tot[nm] for nm in s_medians.keys()]
wt_mean_ms = sum(wt_medians) / sum(c_tot.values())
sum_pop = [sum(n) for n in zip(*c_pops)]
sum_median = dstat.get_wt_quartile(_L_LOWS, sum_pop)
#sum_median = dstat.get_wt_mean(_L_MIDS, sum_pop)
return (mean_ms, wt_mean_ms, sum_median)
def plot_hist(ax, h_data, s_yr, s_type, c_size, s_rep):
ax.set_xlabel(f'Age ({s_type})')
ax.set_title(f"Simulation: {c_size} countries X {s_rep} repetitions")
n, bins, patches = ax.hist(h_data, bins=range(0, 60))
s_mean = statistics.mean(h_data)
s_stdev = statistics.stdev(h_data)
std_err = 1.96 * s_stdev / (s_rep**0.5)
mean_p95 = s_mean + std_err
mean_m95 = s_mean - std_err
terr = t_conf_int(c_size, s_stdev)
t_p95 = s_mean + terr
t_m95 = s_mean - terr
ax.axvline(s_mean, 0, 1, color='r', label=f'Est. Average Age: {s_mean:.2f}')
ax.axvline(mean_m95, 0, 1, color='g', ls=':', label=f'Z 95% conf interval: {mean_m95:.2f}')
ax.axvline(mean_p95, 0, 1, color='m', ls=':', label=f'Z 95% conf interval: {mean_p95:.2f}')
ax.axvline(t_m95, 0, 1, color='c', label=f't 95% conf interval: {t_m95:.2f}')
ax.axvline(t_p95, 0, 1, color='y', label=f't 95% conf interval: {t_p95:.2f}')
# not actually plotting anything as such, just want value to show in legend area
ax.axvline(s_stdev, 0, 0, color='k', label=f'std dev: {s_stdev:.2f}')
# ax.axvline(w_median, 0, 0, color='k', label=f'World Average: {w_median:.2f}')
ax.legend()
def t_conf_int(s_size, s_sdev, p_interval=95, tails=2):
# defualt 95% confidence interval for 2 tail sample
t = stats.t.ppf(1- ((100-p_interval)/tails/100), s_size-1)
d_int = t * s_sdev / (s_size**0.5)
#print(f"t_conf_int({s_size}, {s_sdev}) = {t} * {s_sdev} / {s_size**0.5} = {d_int}")
return d_int
def get_single_sample(r_nbr, df_seed=0):
global df_country, c_data
c_medians = {}
c_totals = {}
wt_mean = 0
sum_median = 0
c_pops = []
sum_pop = [0 for _ in range(21)]
if df_seed:
rand_c = df_country.sample(r_nbr, random_state=df_seed)
else:
rand_c = df_country.sample(r_nbr)
for _, row in rand_c.iterrows():
c_nm = row['valid']
s_data = c_data[c_nm]
c_pops.append(s_data)
c_totals[c_nm] = sum(s_data)
c_median = dstat.get_wt_quartile(_L_LOWS, s_data)
c_medians[c_nm] = c_median
wt_medians = [c_medians[nm] * c_totals[nm] for nm in c_medians.keys()]
wt_mean = sum(wt_medians) / sum(c_totals.values())
sum_pop = [sum(n) for n in zip(*c_pops)]
sum_median = dstat.get_wt_quartile(_L_LOWS, sum_pop)
return c_medians, c_totals, wt_mean, sum_median
def display_stats_w_rpts():
print(f"Random sample of {t_size} countries, repeated {t_rpt} times")
print(f"\n{'':14} | {'Avg':>5} | {'std dev'} | {'norm 95% conf':13} | {'t 95% conf':13}")
print(f"{'':14} | {'':5} | {'':7} | n={t_rpt:<11} | n={t_size:<11}")
print(f"\n{'Simple mean:':14} | {mean_means:.2f} | {sd_means:7.2f} | {means_m95:.2f} - {means_p95:.2f} | {means_t_m95:.2f} - {means_t_p95:.2f}")
print(f"{'Weighted mean:':14} | {mean_wtd:.2f} | {sd_wtd:7.2f} | {wtd_m95:.2f} - {wtd_p95:.2f} | {wtd_t_m95:.2f} - {wtd_t_p95:.2f}")
print(f"{'Summed median:':14} | {mean_sumd:.2f} | {sd_sumd:7.2f} | {sumd_m95:.2f} - {sumd_p95:.2f} | {sumd_t_m95:.2f} - {sumd_t_p95:.2f}")
print(f"{'World median:':14} | {w_median:.2f}")
def display_stats_1_sample():
print(f"Random sample of {t_size} countries, one time")
print(f"\n{'':14} | {'Avg':>5} | {'std dev'} | {'std err':7} | {'t 95% conf':13}")
print(f"{'':14} | {'':5} | {'':7} | {'':7} | n={t_size:<11}")
print(f"\n{'Simple mean:':14} | {mean_means:.2f} | {sd_means:7.2f} | {terr_means:7.2f} | {means_t_m95:.2f} - {means_t_p95:.2f}")
print(f"{'Weighted mean:':14} | {mean_wtd:.2f} | {sd_wtd:7.2f} | {terr_wtd:7.2f} | {wtd_t_m95:.2f} - {wtd_t_p95:.2f}")
print(f"{'Summed median:':14} | {mean_sumd:.2f} | {sd_sumd:7.2f} | {terr_sumd:7.2f} | {sumd_t_m95:.2f} - {sumd_t_p95:.2f}")
print(f"{'World median:':14} | {w_median:.2f}")
def plot_hist_basic(ax, h_data, **kwargs):
# Defaults
x_def = 'Age'
y_def = 'Count'
ttl_def = f"Simulation: {len(h_data)} countries, no repeats"
mean_lbl_def = "Est. Average Age"
do_lgnd = False
# Optional keyworded arguments and derived values
x_lbl = kwargs.get('x_lbl', x_def)
y_lbl = kwargs.get('y_lbl', y_def)
p_ttl = kwargs.get('title', ttl_def)
s_mean = kwargs.get('mean', None)
mean_lgnd = kwargs.get('m_lbl', mean_lbl_def)
s_stdev = kwargs.get('stdev', None)
s_std_err = kwargs.get('std_err', None)
m95 = None
p95 = None
if s_mean and s_std_err:
m95 = s_mean - s_std_err
p95 = s_mean + s_std_err
# now onto plotting
ax.set_xlabel(x_lbl)
ax.set_ylabel(y_lbl)
ax.set_title(p_ttl)
n, bins, patches = ax.hist(h_data, bins=range(0, 60))
if s_mean:
ax.axvline(s_mean, 0, 1, color='r', label=f'{mean_lgnd}: {s_mean:.2f}')
if m95:
ax.axvline(m95, 0, 1, color='hotpink', ls='-.', label=f'min 95% conf interval: {m95:.2f}')
if p95:
ax.axvline(p95, 0, 1, color='orange', ls='-.', label=f'max 95% conf interval: {p95:.2f}')
# not actually plotting anything as such, just want value to show in legend area
if s_stdev:
ax.axvline(s_stdev, 0, 0, color='w', label=f'std dev: {s_stdev:.2f}')
# not actually plotting anything as such, just want value to show in legend area
if s_std_err:
ax.axvline(s_std_err, 0, 0, color='w', label=f'std err: {s_std_err:.2f}')
# ax.axvline(w_median, 0, 0, color='k', label=f'World Average: {w_median:.2f}')
# display legend if appropriate
if s_mean or s_stdev or s_std_err:
ax.legend()
def all_stat_sng_sample(r_nbr, df_seed=0):
if not df_seed:
s_medians, s_totals, mean_wtd, mean_sumd = get_single_sample(r_nbr)
else:
s_medians, s_totals, mean_wtd, mean_sumd = get_single_sample(r_nbr, df_seed=df_seed)
mean_means = statistics.mean(s_medians.values())
sd_means = statistics.stdev(s_medians.values())
terr_means = t_conf_int(r_nbr, sd_means)
sd_wtd = np.sqrt(np.cov(list(s_medians.values()), aweights=list(s_totals.values())))
terr_wtd = t_conf_int(r_nbr, sd_wtd)
sd_sumd = sd_wtd
terr_sumd = t_conf_int(r_nbr, sd_sumd)
p_means = [mean_means, mean_wtd, mean_sumd]
p_sds = [sd_means, sd_wtd, sd_sumd]
p_ses = [terr_means, terr_wtd, terr_sumd]
return s_medians, s_totals, p_means, p_sds, p_ses
if __name__ == '__main__':
tst_desc = ['na',
"Plot Histograms of 3 Types of Repeated Sample Data",
"Plot 4 histograms showing sample statistic for various repetitions",
"Plot 4 histograms showing sample statistic for various sample sizes",
"Confidence Interval Variations",
"Generate and plot a single sample of the specified size",
"Generate and compare samples of varying sizes",
"Compare repeated samples of a single size",
"Build table from test data",
# following should always be the last test
"Plot Single Statistic, One Histogram"
]
max_tst = len(tst_desc) - 1
smpl_types =['Simple Mean of Sample Medians',
'Weighted Mean of Sample Medians',
'Median of Summed Sample Data'
]
max_smpl = len(smpl_types)
max_size = 50
max_rpt = 10000
# defaults
# define and get command line arguments
parser = argparse.ArgumentParser()
# long name preceded by --, short by single -, get it as an integer so can use to access test data array
parser.add_argument('--do_exp', '-e', type=int, help=f'Which experiment to run (1-{max_tst})')
parser.add_argument('--c_cnt', '-c', type=int, help=f'Number of countries to use per randome sample (1-{max_size})')
parser.add_argument('--rpts', '-r', type=int, help=f'How many sample repetitions to use for current run (1-{max_rpt})')
parser.add_argument('--stat', '-s', type=int, help=f'Which sample statistic to display (1-{max_smpl}), if applicable')
parser.add_argument('--year', '-y', help=f'Which year to use for sample run (1950-2100)')
parser.add_argument('--e7_nbr', '-n', help=f'File number for experiment 7, used to create file name')
parser.add_argument('--e7_multiples', '-m', type=int, help=f'number of times (multiples) to run experiment 7')
args = parser.parse_args()
if args.do_exp:
if (args.do_exp >= 1 and args.do_exp <= max_tst):
n_tst = args.do_exp
else:
_DO_SAMPLING = False
if args.c_cnt and (args.c_cnt >= 1 and args.c_cnt <= max_size):
t_size = args.c_cnt
if args.rpts and (args.rpts >= 1 and args.rpts <= max_rpt):
t_rpt = args.rpts
if args.stat and (args.stat >= 1 and args.stat <= max_smpl):
t_stat = args.stat - 1
if args.year and (int(args.year) >= 1950 and int(args.year) <= 2100):
use_yr = args.year
if args.e7_nbr:
e7_f_nbr = args.e7_nbr
#print(f"n_tst: {n_tst}, t_size: {t_size}, t_rpt: {t_rpt}, t_stat: {t_stat}, use_yr: {use_yr}")
if _BLD_DB_CC_CSV:
s_tm = time.time()
rcn.make_db_name_id_csv()
e_tm = time.time()
print(f"building {_LST_CC_NM} took {e_tm - s_tm}")
if _BLD_MOD_CSV:
s_tm = time.time()
rcn.make_mod_country_csv()
e_tm = time.time()
diff_tm = e_tm - s_tm
print(f"modifying and saving dataframe took {diff_tm:.2f}")
if _DO_SAMPLING:
lst_means = []
lst_wt_means = []
lst_summed_mdns = []
s_medians = {}
s_totals = {}
sumd_mds = 0
mean_means, sd_means, serr_means, terr_means = (0, 0, 0, 0)
mean_wtd, sd_wtd, serr_wtd, terr_wtd = (0, 0, 0, 0)
mean_sumd, sd_sumd, serr_sumd, terr_sumd = (0, 0, 0, 0)
h_lists = [[], [], []]
sd_tm = time.time()
df_country = pd.read_csv(D_PATH / _CSV_MOD_CC, comment='#', usecols = ['continent','country','valid'])
valid_c = df_country['valid'].tolist()
valid_c.append("World")
valid_c.sort()
c_data = pdb.get_crs_1yr_all(valid_c, use_yr)
ed_tm = time.time()
diff_tm = ed_tm - sd_tm
print(f"loading CSV and getting data from population database took {diff_tm:.2f} ({sd_tm:.6f}-{ed_tm:.6f})")
# get world median age for current year
w_median = dstat.get_wt_quartile(_L_LOWS, c_data['World'])
#w_median = dstat.get_wt_mean(_L_MIDS, c_data['World'])
if t_rpt == 1:
s_medians, s_totals, mean_wtd, mean_sumd = get_single_sample(t_size)
mean_means = statistics.mean(s_medians.values())
sd_means = statistics.stdev(s_medians.values())
serr_means = 0
terr_means = t_conf_int(t_size, sd_means)
sd_wtd = np.sqrt(np.cov(list(s_medians.values()), aweights=list(s_totals.values())))
serr_wtd = 0
terr_wtd = t_conf_int(t_size, sd_wtd)
sd_sumd = sd_wtd
serr_sumd = 0
terr_sumd = t_conf_int(t_size, sd_sumd)
else:
for _ in range(t_rpt):
mean_mds, wt_mean_mds, summed_md = get_sample(t_size)
#print(mean_mds, wt_mean_mds, summed_md)
lst_means.append(mean_mds)
lst_wt_means.append(wt_mean_mds)
lst_summed_mdns.append(summed_md)
mean_means = statistics.mean(lst_means)
sd_means = statistics.stdev(lst_means)
serr_means = 1.96 * sd_means / (t_rpt**0.5)
terr_means = t_conf_int(t_size, sd_means)
mean_wtd = statistics.mean(lst_wt_means)
sd_wtd = statistics.stdev(lst_wt_means)
serr_wtd = 1.96 * sd_wtd / (t_rpt**0.5)
terr_wtd = t_conf_int(t_size, sd_wtd)
mean_sumd = statistics.mean(lst_summed_mdns)
sd_sumd = statistics.stdev(lst_summed_mdns)
serr_sumd = 1.96 * sd_sumd / (t_rpt**0.5)
terr_sumd = t_conf_int(t_size, sd_sumd)
h_lists = [lst_means, lst_wt_means, lst_summed_mdns]
# calc 95% confidence intervals
means_p95 = mean_means + serr_means
means_m95 = mean_means - serr_means
means_t_p95 = mean_means + terr_means
means_t_m95 = mean_means - terr_means
wtd_p95 = mean_wtd + serr_wtd
wtd_m95 = mean_wtd - serr_wtd
wtd_t_p95 = mean_wtd + terr_wtd
wtd_t_m95 = mean_wtd - terr_wtd
sumd_p95 = mean_sumd + serr_sumd
sumd_m95 = mean_sumd - serr_sumd
sumd_t_p95 = mean_sumd + terr_sumd
sumd_t_m95 = mean_sumd - terr_sumd
es_tm = time.time()
diff_tm = es_tm - ed_tm
print(f"\ngenerating sample stats took {diff_tm:.2f} ({ed_tm:.6f}-{es_tm:.6f})")
print(f"\nTest #{n_tst}: {tst_desc[n_tst]}\n")
# display appropriate data in terminal window
if n_tst != 6:
if t_rpt == 1:
display_stats_1_sample()
else:
display_stats_w_rpts()
if n_tst == 1:
fig, axs = plt.subplots(2, 2, sharey=True, gridspec_kw={'hspace': .3}, figsize=(15,8))
axs[1, 1].axis('off')
fig.suptitle(f"Estimated World Average Age: {use_yr}")
p_row = 0
for i in range(len(h_lists)):
plot_hist(axs[p_row, i % 2], h_lists[i], use_yr, smpl_types[i], t_size, t_rpt)
if i > 0 and i % 2 == 1:
p_row += 1
elif n_tst == 2:
pass
elif n_tst == 3:
pass
elif n_tst == 4:
z_95s = [(means_m95, means_p95, mean_means), (wtd_m95, wtd_p95, mean_wtd), (sumd_m95, sumd_p95, mean_sumd)]
t_95s = [(means_t_m95, means_t_p95, mean_means), (wtd_t_m95, wtd_t_p95, mean_wtd), (sumd_t_m95, sumd_t_p95, mean_sumd)]
for i in range(len(smpl_types)):
in_z_95_low = 0
in_z_95_high = 0
in_t_95_low = 0
in_t_95_high = 0
for mm in h_lists[i]:
if mm >= z_95s[i][0] and mm <= z_95s[i][1]:
if mm < z_95s[i][2]:
in_z_95_low += 1
else:
in_z_95_high += 1
if mm >= t_95s[i][0] and mm <= t_95s[i][1]:
if mm < z_95s[i][2]:
in_t_95_low += 1
else:
in_t_95_high += 1
print(f"\n{(in_z_95_low + in_z_95_high)/t_rpt*100:.2f}% of samples fall within Z 95% confidence interval for {smpl_types[i]}")
print(f"\t{in_z_95_low} below the mean and {in_z_95_high} above")
print(f"{(in_t_95_low + in_t_95_high)/t_rpt*100:.2f}% of samples fall within t 95% confidence interval for {smpl_types[i]}")
print(f"\t{in_t_95_low} below the mean and {in_t_95_high} above")
elif n_tst == 5:
fig, axs = plt.subplots(2, 2, figsize=(15,9), sharey=True)
# x_lbl = kwargs.get('x_lbl', x_def)
# p_ttl = kwargs.get('title', ttl_def)
# s_mean = kwargs.get('mean', None)
# s_stdev = kwargs.get('stdev', None)
# s_std_err = kwargs.get('std_err', None)
fig.suptitle(f"Estimated World Average Age for {use_yr} ({t_size} countries, one time)")
kargs = {
'title': '',
'x_lbl': f'Average Age (Median Age)',
'mean': mean_means,
'm_lbl': "Est. Average Age (simple mean)",
'stdev': sd_means,
'std_err': terr_means
}
plot_hist_basic(axs[0, 0], list(s_medians.values()), **kargs)
kargs = {
'title': '',
'x_lbl': f'Average Age (Median Age)',
'mean': mean_wtd,
'm_lbl': "Est. Average Age (weighted mean)",
'stdev': sd_wtd,
'std_err': terr_wtd
}
plot_hist_basic(axs[0, 1], list(s_medians.values()), **kargs)
kargs = {
'title': '',
'x_lbl': f'Average Age (Median Age)',
'mean': mean_sumd,
'm_lbl': "Est. Average Age (median of sum)",
'stdev': sd_sumd,
'std_err': terr_sumd
}
plot_hist_basic(axs[1, 0], [mean_sumd], **kargs)
elif n_tst == 6:
#print(f"t_stat = {t_stat}")
fig, axs = plt.subplots(2, 2, figsize=(15,12), sharey=True)
fig.suptitle(f"Estimated World Average Age for {use_yr} (various sample sizes, one time)")
ttl_stat = ['simple mean', 'weighted mean', 'median of sum']
print(f"Random sample of varying numbers of countries, {ttl_stat[t_stat]}, one time")
print(f"\n{'Sample Size':14} | {'Avg':>5} | {'std dev'} | {'std err':7} | {'t 95% conf':13}")
print(f"{'':14} | {'':5} | {'':7} | {'':7} | {'':<11}")
r_nbr = 0
for i in range(4):
if i > 0 and i % 2 == 0:
r_nbr += 1
c_nbr = t_size + (i * 10)
p_medians, p_tots, p_mean, p_sd, p_se = all_stat_sng_sample(c_nbr)
m95 = p_mean[t_stat] - p_se[t_stat]
p95 = p_mean[t_stat] + p_se[t_stat]
int_95 = f"{m95:.2f} - {p95:.2f}"
print(f"{c_nbr:14} | {p_mean[t_stat]:5.2f} | {p_sd[t_stat]:7.2f} | {p_se[t_stat]:7.2f} | {int_95}")
kargs = {
'title': f'Sample of {len(p_medians)} countries',
'x_lbl': f'Average Age (Median Age)',
'mean': p_mean[t_stat],
'm_lbl': f"Est. Average Age ({ttl_stat[t_stat]})",
'stdev': p_sd[t_stat],
'std_err': p_se[t_stat]
}
if t_stat == 0:
plot_hist_basic(axs[r_nbr, i % 2], list(p_medians.values()), **kargs)
elif t_stat == 1:
plot_hist_basic(axs[r_nbr, i % 2], list(p_medians.values()), **kargs)
elif t_stat == 2:
plot_hist_basic(axs[r_nbr, i % 2], [p_mean[t_stat]], **kargs)
print(f"{'World median:':14} | {w_median:.2f}")
elif n_tst == 7:
# https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.bar.html
# https://matplotlib.org/3.1.0/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py
t7_data_file = f't7_samples_{e7_f_nbr}.txt'
t7_rpt = 10
t7_size = 33
t7_runs = 1
t7_stat = 0
t7_means = []
t7_stderrs = []
t7_seed = 349
if args.stat and (args.stat >= 1 and args.stat <= max_smpl):
t7_stat = args.stat - 1
if args.rpts and (args.rpts >= 1 and args.rpts <= max_rpt):
t7_rpt = args.rpts
if args.c_cnt and (args.c_cnt >= 1 and args.c_cnt <= max_size):
t7_size = args.c_cnt
if args.e7_multiples and (args.e7_multiples >= 1 and args.e7_multiples <= 20):
t7_runs = args.e7_multiples
def print_tbl(s_means, s_serrs):
print(f"Sample means +- std err for {t7_rpt} samples of size {t7_size}")
for i, s_mean in enumerate(s_means):
if i > 0:
print(" | ", end='')
print(f"{s_mean:.1f} +- {s_serrs[i]:.1f}", end='')
print()
missed_interval = 0
missed_1st_95 = 0
tmp_seed = t7_seed
for rnbr in range(1, t7_runs + 1):
tmp_seed += ((rnbr - 1) * 10031)
t7_means = []
t7_stderrs = []
for i in range(t7_rpt):
tmp_seed += (i * 13)
p_medians, p_tots, p_mean, p_sd, p_se = all_stat_sng_sample(t7_size, df_seed=tmp_seed)
t7_means.append(p_mean[t7_stat])
t7_stderrs.append(p_se[t7_stat])
missed_interval = 0
missed_1st_95 = 0
fst_m95 = t7_means[0] - t7_stderrs[0]
fst_p95 = t7_means[0] + t7_stderrs[0]
for i, mean in enumerate(t7_means):
if w_median < mean - t7_stderrs[i] or w_median > mean + t7_stderrs[i]:
missed_interval += 1
if mean < fst_m95 or mean > fst_p95:
missed_1st_95 += 1
print_tbl(t7_means, t7_stderrs)
with open(P_PATH / t7_data_file, 'a') as fl_dbug:
fl_dbug.write(f"'{rnbr}': {{\n")
fl_dbug.write(f" 'means': {t7_means},\n")
fl_dbug.write(f" 'stderrs': {t7_stderrs},\n")
fl_dbug.write(f" 'missed': ({missed_interval}, {missed_1st_95})\n")
fl_dbug.write(f" }},\n")
ind = np.arange(len(t7_means)) # the x locations for the groups
width = 0.7 # the width of the bars
f_wd = 10
f_ht = 6
if t7_rpt > 19:
f_wd = 15
f_ht = 9
elif t7_rpt > 34:
f_wd = 20
f_ht = 12
fig, ax = plt.subplots(figsize=(f_wd,f_ht))
rects1 = ax.bar(ind, t7_means, width, yerr=t7_stderrs)
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel(f'Est. Average Age ({smpl_types[t7_stat]})')
ax.set_title(f'Estimated Average Age from {t7_rpt} samples of size {t7_size}')
ax.set_xticks(ind)
m95_1 = t7_means[t7_stat] - t7_stderrs[t7_stat]
p95_1 = t7_means[t7_stat] + t7_stderrs[t7_stat]
ax.axhspan(m95_1, p95_1, facecolor='green', alpha=0.2)
ax.axhline(w_median, color='r')
ax.axvline(missed_interval, 0, 0, color='w', label=f'world avg not in conf interval: {missed_interval}')
#ax.axvline(missed_1st_95, 0, 0, color='w', label=f'sample means not in first samples 95% conf: {missed_1st_95}')
ax.legend()
def autolabel(rects, xpos='center'):
"""
Attach a text label above each bar in *rects*, displaying its height.
*xpos* indicates which side to place the text w.r.t. the center of
the bar. It can be one of the following {'center', 'right', 'left'}.
"""
ha = {'center': 'center', 'right': 'left', 'left': 'right'}
offset = {'center': 0, 'right': 1, 'left': -1}
for rect in rects:
height = rect.get_height()
ax.annotate(f'{height:.1f}',
xy=(rect.get_x() + rect.get_width() / 2, height),
xytext=(offset[xpos]*3, 3), # use 3 points offset
textcoords="offset points", # in both directions
ha=ha[xpos], va='bottom')
autolabel(rects1, "left")
elif n_tst == 8:
import t7_samples_3_tst as t7sr
#print(t7sr.sample_runs)
table = []
table.append('Sample |')
for i in range(len(t7sr.sample_runs['1']['means'])):
table.append(f'Sample {(i + 1):2} | ')
for i in range(len(t7sr.sample_runs['1']['means'])):
pass
print("\n".join(table))
elif n_tst == max_tst:
fig, ax = plt.subplots(figsize=(10,6))
fig.suptitle(f"Estimated World Average Age: {use_yr}")
plot_hist(ax, h_lists[t_stat], use_yr, smpl_types[t_stat], t_size, t_rpt)
es_tm = time.time()
diff_tm = es_tm - sd_tm
print(f"\nwhole sampling process took {diff_tm:.2f}")
plt.show()
t7_build_table.py
This one not likely of much interest to you, but just in caseā¦
"""Code for generating HTML tables and/or plots for posts world_avg_age_*/*.
Uses data generated by experiment #7 in est_avg_age.py
"""
import sys
from pathlib import Path
import argparse
import matplotlib.pyplot as plt
file = Path(__file__).resolve()
parent, root = file.parent, file.parents[0]
#print(file.parents[0])
#print(f"parent: {parent}; root: {root}\n")
sys.path.append(str(root))
# Additionally remove the current file's directory from sys.path
try:
sys.path.remove(str(parent))
except ValueError: # Already removed
pass
min_tst = 2
max_tst = 5
do_tst = 2
# define and get command line arguments
parser = argparse.ArgumentParser()
# long name preceded by --, short by single -, get it as an integer so can use to access test data array
parser.add_argument('--do_tst', '-t', type=int, help=f'Which test run to use ({min_tst}-{max_tst})')
args = parser.parse_args()
if args.do_tst:
if (args.do_tst >= min_tst and args.do_tst <= max_tst):
do_tst = args.do_tst
else:
print(f"Number for test run, {args.do_tst}, not valid, should be {min_tst}-{max_tst}")
import t7_samples_2_tst as t7ts
if do_tst == 1:
exit(1)
elif do_tst == 2:
import t7_samples_2_tst as t7ts
elif do_tst == 3:
import t7_samples_3_tst as t7ts
elif do_tst == 4:
import t7_samples_4_tst as t7ts
elif do_tst == 5:
print(f"importing t7_samples_5_tst")
import t7_samples_5_tst as t7ts
if True:
#print(t7ts.sample_runs)
row_tbl = []
t_out = f"r:/learn/py_play/population/play/t7_tbl_out_{do_tst}.html"
with open(t_out, 'w') as tbl_out:
tbl_out.write('<table>\n')
row_tbl.append("<tr><th> </th>")
nbr_samples = len(t7ts.sample_runs['1']['means'])
l_runs = list(t7ts.sample_runs.keys())
for i in range(nbr_samples):
s_mean = f"{t7ts.sample_runs['1']['means'][i]:.1f}"
s_err = f"{t7ts.sample_runs['1']['stderrs'][i]:.1f}"
row_tbl.append(f"<tr><th>Sample {i + 1}</th>")
#row_tbl.append(f"<tr><th>Sample {i + 1}:</th><td>{s_mean} ±{s_err}</td>")
row_tbl.append("<tr><th>Run Mean</th>")
row_tbl.append("<tr><th>True Missed</th>")
row_tbl.append("<tr><th>In 1st CI</th>")
for run, data in t7ts.sample_runs.items():
row_tbl[0] += f"<th>Run {run}</th>"
for i in range(nbr_samples):
s_mean = f"{t7ts.sample_runs[run]['means'][i]:.1f}"
s_err = f"{t7ts.sample_runs[run]['stderrs'][i]:.1f}"
row_tbl[i + 1] += f"<td>{s_mean} ±{s_err}</td>"
avg_means = sum(t7ts.sample_runs[run]['means']) / nbr_samples
row_tbl[nbr_samples + 1] += f"<td>{avg_means:.1f}</td>"
row_tbl[nbr_samples + 2] += f"<td>{t7ts.sample_runs[run]['missed'][0]:>1}</td>"
row_tbl[nbr_samples + 3] += f"<td>{t7ts.sample_runs[run]['missed'][1]:>1}</td>"
for row in row_tbl:
tbl_out.write(f"{row}</tr>\n")
tbl_out.write("</table>\n")
# plot histogram of all samples
def plot_hist_basic(ax, h_data, **kwargs):
# Defaults
x_def = 'Age'
y_def = 'Count'
ttl_def = f"Simulation: {len(h_data)} countries, no repeats"
mean_lbl_def = "Est. Average Age"
rng_min = 22
rng_max = 34
nbr_bin = 48
do_lgnd = False
# Optional keyworded arguments and derived values
x_lbl = kwargs.get('x_lbl', x_def)
y_lbl = kwargs.get('y_lbl', y_def)
p_ttl = kwargs.get('title', ttl_def)
s_mean = kwargs.get('mean', None)
mean_lgnd = kwargs.get('m_lbl', mean_lbl_def)
s_stdev = kwargs.get('stdev', None)
s_std_err = kwargs.get('std_err', None)
m95 = None
p95 = None
if s_mean and s_std_err:
m95 = s_mean - s_std_err
p95 = s_mean + s_std_err
pop_mean = kwargs.get('pop_mean')
# now onto plotting
ax.set_xlabel(x_lbl)
ax.set_ylabel(y_lbl)
ax.set_title(p_ttl)
n, bins, patches = ax.hist(h_data, bins=nbr_bin, range=(rng_min, rng_max))
if s_mean:
ax.axvline(s_mean, 0, 1, color='r', label=f'{mean_lgnd}: {s_mean:.2f}')
if m95:
ax.axvline(m95, 0, 1, color='hotpink', ls='-.', label=f'min 95% conf interval: {m95:.2f}')
if p95:
ax.axvline(p95, 0, 1, color='orange', ls='-.', label=f'max 95% conf interval: {p95:.2f}')
# not actually plotting anything as such, just want value to show in legend area
if s_stdev:
ax.axvline(s_stdev, 0, 0, color='w', label=f'std dev: {s_stdev:.2f}')
# not actually plotting anything as such, just want value to show in legend area
if s_std_err:
ax.axvline(s_std_err, 0, 0, color='w', label=f'std err: {s_std_err:.2f}')
# ax.axvline(w_median, 0, 0, color='k', label=f'World Average: {w_median:.2f}')
# display legend if appropriate
if pop_mean:
ax.axvline(pop_mean, 0, 1, color='aqua', label=f'Population average: {pop_mean:.2f}')
if s_mean or s_stdev or s_std_err:
ax.legend()
all_smpl = []
for run, data in t7ts.sample_runs.items():
all_smpl += t7ts.sample_runs[run]['means']
nbr_smpl = len(all_smpl)
#print(all_smpl)
#fig, axs = plt.subplots(2, 2, figsize=(15,9), sharey=True)
f_wd = 10
f_ht = 6
fig, ax = plt.subplots(figsize=(f_wd,f_ht))
# x_lbl = kwargs.get('x_lbl', x_def)
# p_ttl = kwargs.get('title', ttl_def)
# s_mean = kwargs.get('mean', None)
# s_stdev = kwargs.get('stdev', None)
# s_std_err = kwargs.get('std_err', None)
fig.suptitle(f"Estimated World Average Age of {nbr_smpl} Samples")
world_avg = 28.68
smpl_mean = sum(all_smpl) / nbr_smpl
rnd_mean = f"{smpl_mean:.2f}"
plt_mean = float(rnd_mean)
kargs = {
'title': '',
'mean': plt_mean,
'm_lbl': 'Mean of all samples:',
'pop_mean': world_avg
}
plot_hist_basic(ax, all_smpl, **kargs)
plt.show()