import pandas
import math
%matplotlib inline
rename_dict = {}
for i in range(1995, 2015):
rename_dict[' ' + str(i)] = str(i)
country_code2country = {}
f = open("who-mortality/country_codes.csv")
f.readline()
for s in f.readlines():
splitted = s.rstrip().split(',')
code = splitted[0]
name = ','.join(splitted[1:])
country_code2country[int(code)] = name
f.close()
article_subjects = ["genetics", "psychology", "physiology", "molecular"]
sex2int = {"Persons" : 3, "Male" : 1, "Female" : 2}
disease = pandas.read_excel("who-desease-burden/GHE2015_YLD-2015-country.xls", sheet_name=1, header=6)
mortality = pandas.read_csv("who-mortality/mort-no-nan-col.csv")
causes_of_death = pandas.read_csv("who-mortality/causes-of-death.csv", sep=";")
articles = pandas.DataFrame()
for subj in article_subjects:
for year in range(2007, 2016):
current_frame = pandas.read_excel("scimago-articles/scimago/data/{}/{}.xls".format(subj, year))
current_frame["Year"] = (pandas.Series([year] * len(current_frame)))
current_frame["Subject"] = (pandas.Series([subj] * len(current_frame)))
articles = articles.append(current_frame)
i = 0
def choose_last_value(a):
a, b, c, d = a
if pandas.notnull(d):
return d
elif pandas.notnull(c):
return c
elif pandas.notnull(b):
return b
return a
mortality["Country"] = mortality["Country"].map(country_code2country.get)
health_exp = pandas.read_csv("who-health-expenditure/exp-years-percent.csv", header=1)
health_exp = health_exp.rename(columns=rename_dict)
health_exp = health_exp.set_index("Country")
health_exp = health_exp.drop(["Bolivia (Plurinational State of)"])
gdp = pandas.read_csv("who-health-expenditure/worldGDP.csv", header = 2)
gdp = gdp.drop(gdp.loc[:, "Country Code": "1990"], axis=1)
del gdp["Unnamed: 62"]
del gdp["2017"] # empty column
gdp = gdp.set_index("Country Name")
gdp.loc["Bolivia (Plurinational State of)"] = gdp.loc["Bolivia"]
gdp.loc["Bahamas"] = gdp.loc["Bahamas, The"]
gdp.loc["Côte d'Ivoire"] = gdp.loc["Cote d'Ivoire"]
gdp.loc["Congo"] = gdp.loc["Congo, Rep."]
gdp.loc["Czechia"] = gdp.loc["Czech Republic"]
gdp.loc["United States of America"] = gdp.loc["United States"]
gdp.loc["Egypt"] = gdp.loc["Egypt, Arab Rep."]
gdp.loc['United Kingdom of Great Britain and Northern Ireland'] = gdp.loc["United Kingdom"]
# different sources differ in countries' names
health_exp_abs_pc = pandas.DataFrame(health_exp).copy()
broken_names = []
for i in health_exp.index:
for j in health_exp.columns:
try:
health_exp_abs_pc.loc[j, i] *= gdp[j][i]
#health_exp_abs_pc.loc[j, i] /= basic["Population (\'000) (2)"][j]
health_exp_abs_pc.loc[j, i] = math.log(float(health_exp_abs_pc[j][i]))
except:
#print(i, j)
broken_names.append(i)
#'''
disease_names = disease.loc[:, "GHE cause":"Member State\n(See Notes for explanation of colour codes)"]
disease["Name"] = disease_names.apply(choose_last_value, axis=1)
basic = disease[disease["GHE cause"].notnull()]
basic["Sex"] = pandas.DataFrame(basic["Sex"].map(sex2int))
basic = basic.drop(basic.loc[:, "Unnamed: 2": "Member State\n(See Notes for explanation of colour codes)"].columns, axis=1)
basic = basic[basic["Sex"].notnull()]
basic = basic.set_index("Name").transpose()
#basic = disease.loc[:, "Afghanistan":"Zimbabwe"][2:15]
#basic = basic.transpose()
for i in basic.columns:
try:
basic[i] = pandas.to_numeric(basic[i])
basic[i + " r"] = basic[i] / basic["Population (\'000) (2)"] * 1000
except:
del basic[i]
filtered_small = basic[basic.iloc[:, 1] < 100000]
#filtered_small.plot.scatter(x="Population ('000) (2)", y="Digestive diseases")
filtered_large = basic[basic.iloc[:, 1] > 100000].transpose()
filtered_large_comm = filtered_large.iloc[2:8] # all communicable diseases
filtered_large_comm = filtered_large_comm[1:]
plcot_1 = filtered_large_comm.plot.barh(by = ["China", "India"])
filtered_large_ncomm = filtered_large.loc["Noncommunicable diseases":"Oral conditions"]
filtered_large_ncomm = filtered_large_ncomm[1:]
plot_2 = filtered_large_ncomm.plot.bar(by = ["China", "India"], log=True)
filtered_large_comm.to_json("../docs/resources/india_china_comm.json")
filtered_large_ncomm.to_json("../docs/resources/india_china_ncomm.json")
В Индии население немногим меньше, чем в Китае, в обеих странах более миллиарда жителей. Сравним показатели заболеваемостей.
oncology = basic.sort_values(by="Malignant neoplasms r", ascending = False)
max_relative_neoplasms = oncology.iloc[1:20] #first is sex (garbage)
max_relative_neoplasms.plot.bar(by = "", y="Malignant neoplasms r", legend=False, title="Malignant neoplasms, ‰")
max_relative_neoplasms["Malignant neoplasms r"].to_json("../docs/resources/neoplasms.json")
Мы видим, что больше всего люди страдают от злокачественных опухолей в продвинутых, экономически благополучных европейских странах. Барбадос?
mental_disorders = basic.sort_values(by="Mental and substance use disorders r", ascending = False)
max_relative_mental_burden = mental_disorders.iloc[1:15] #first is sex (garbage)
max_relative_mental_burden.plot.bar(by = "", y="Mental and substance use disorders r", legend=False, title="Mental burden, ‰")
max_relative_mental_burden["Mental and substance use disorders r"].to_json("../docs/resources/mental.json")
В этом графике очень много стран бывшего Советского Союза по сравнению с предыдущим
($ per capita)
и уровень заболеваемости¶burden_and_expenditure = (basic.transpose().append(health_exp_abs_pc["2014"])).transpose()
burden_and_expenditure = (burden_and_expenditure.transpose().append(health_exp_abs_pc["2009"])).transpose()
noncomm_expenditure = burden_and_expenditure.plot.scatter(x="All Causes r", y='2014')
burden_and_expenditure.plot.scatter(x="All Causes r", y='2009', ax=noncomm_expenditure, color="red")
diff = pandas.Series()
min_exp = pandas.Series()
max_exp = pandas.Series()
for i in health_exp_abs_pc.index:
min_exp[i] = health_exp_abs_pc.loc[i, "2006"]
max_exp[i] = health_exp_abs_pc.loc[i, "2006"]
for j in health_exp_abs_pc.columns:
min_exp[i] = min(min_exp[i], health_exp_abs_pc.loc[i, j])
max_exp[i] = max(max_exp[i], health_exp_abs_pc.loc[i, j])
diff[i] = max_exp[i] - min_exp[i]
diff = diff.sort_values()
lowest_exp = pandas.DataFrame()
for i in diff[:7].index:
lowest_exp[i] = health_exp_abs_pc.loc[i]
lowest_exp["Year"] = pandas.to_numeric(lowest_exp.index)
lowest_exp = lowest_exp[lowest_exp["Year"] % 2 == 0]
lowest_exp.plot.line(x = "Year")
highest_exp = pandas.DataFrame()
for i in diff[-7:].index:
highest_exp[i] = health_exp_abs_pc.loc[i]
highest_exp.loc["2012":"2002", :].to_json("../docs/resources/highest-exp.json")
highest_exp["Year"] = pandas.to_numeric(highest_exp.index)
highest_exp = highest_exp[highest_exp["Year"] % 2 == 0]
highest_exp.plot.line(x = "Year")
diff["Russia"] = diff["Russian Federation"]
diff["Czech Republic"] = diff["Czechia"]
diff["United Kingdom"] = diff["United Kingdom of Great Britain and Northern Ireland"]
diff["Iran"] = diff["Iran (Islamic Republic of)"]
diff["Venezuela"] = diff["Venezuela (Bolivarian Republic of)"]
diff["Vietnam"] = diff["Viet Nam"]
diff.to_json("../docs/resources/exp-map.json")
Посмотрели на страны с наибольшей разницей вклада в медицину между 2006 и 2014 годом, такой странный пик у этих стран в 2006 году. А в стране Niue в 2012 году видим, что на одного человека в год тратили 5$, а в `2006 - 40$`. Воспользовавшись поиском, находим, что в 2004 году Циклон Хета произвёл на острове большие разрушения и надо было строить новый госпиталь (это маленькое островное гос-во, с территорией 18х23 км), госпиталей мало.
Посмотрели на другую сторону таблицы, и видим, что тут никаких резких отклонений нет. Эти страны не зря называются развивающимися: расходы на медицину хоть и колеблются, но имеют тенденцию расти.
#for i in diff.index:
# mortality.loc[i, "Exp Difference"] = diff[i]
#mortality.sort_values("Exp Difference")[:10].plot.bar(y = "All Causes r")
mortality_homicide = mortality[(("X85" <= mortality["Cause"]) & (mortality["Cause"] <= "Y09")) | (mortality["Cause"] == "UE64")]
homicide_total = pandas.DataFrame()
countries = set(mortality_homicide["Country"])
years = range(2005, 2016)
for c in countries:
for y in years:
homicide_total.loc[c, y] = mortality_homicide[(mortality_homicide["Country"] == c) & (mortality_homicide["Year"] == y)]["Deaths1"].sum()
if homicide_total.loc[c, y] == 0 and y > 2005:
homicide_total.loc[c, y] = homicide_total.loc[c, y - 1]
homicide_diff = homicide_total[2015].copy()
for i in homicide_diff.index:
homicide_diff[i] = int(homicide_diff[i]) - homicide_total.loc[i, 2005]
homicide_diff = homicide_diff.sort_values()
top_homicide = pandas.DataFrame()
for i in range(-7, 7):
top_homicide[homicide_diff.index[i]] = pandas.to_numeric(homicide_total.loc[homicide_diff.index[i]])
print(homicide_diff.index[i])
for i in top_homicide.index:
top_homicide.loc[i, "Year"] = int(i)
top_homicide.plot.line(x = "Year")
top_homicide.to_html("../docs/min_max_homicides.html")
Среди стран, в которых убийств стало больше, много стран Центральной и Латинской Америки. В Бразилии, например, стало на 20% больше (смотри таблицу на сайте). Но в то же время в Колумбии их количество падает. Также и в Мексике, пик пришёлся на 2011 год. В 2006 году к власти пришёл новый президент, за 6 лет его правления количество насильственных смертей выростло в 2 раза, в 2012 году власть сменилась, и преступность стала падать.
mortality_undetermined = mortality[(("Y10" <= mortality["Cause"]) & (mortality["Cause"] <= "Y34")) | (mortality["Cause"] == "UE65")]
mortality_undetermined["Deaths1"] = pandas.to_numeric(mortality_undetermined["Deaths1"])
mortality_undetermined = mortality_undetermined.sort_values("Deaths1", ascending=False)
undetermined_countries = set()
for i in list(mortality_undetermined["Country"]):
if (len(undetermined_countries) > 10):
break
undetermined_countries.add(i)
#for i in
undetermined_c_df = pandas.DataFrame()
for i in undetermined_countries:
for j in years:
undetermined_c_df.loc[i, j] = mortality_undetermined[(mortality_undetermined["Country"] == i) &
(mortality_undetermined["Year"] == j)]["Deaths1"].sum()
undetermined_c_df.transpose().plot.line()
undetermined_c_df
Undetermined intent is only for use when there is specific documentation in the record that the intent of the injury cannot be determined.
Очень много очень сильных перепадов.
big_8 = ["Russian Federation", "United States of America", "United Kingdom of Great Britain and Northern Ireland",
"France", "Japan", "Germany", "Canada", "Italy"]
big_8_health_exp = pandas.DataFrame()
for i in big_8:
big_8_health_exp[i] = health_exp_abs_pc.loc[i]
big_8_health_exp["Year"] = pandas.to_numeric(big_8_health_exp.index)
big_8_health_exp = big_8_health_exp.rename(lambda x : "Great Britain" if x == big_8[2] else x, axis = 1)
big_8_health_exp.plot.line(x = "Year")
big_8_health_exp.loc[:, "Russian Federation":"Italy"].to_json("../docs/resources/health_exp.json")
big_8_health_exp
Неожиданный факт: в 2014 году в РФ такое же финансирование медицины, как и в 1996, и падает с 2006 года.
Неожиданный факт №2: в 2014 же году РФ перестала входить в её состав, но и данные у нас до 2014.
filtered_bae = burden_and_expenditure[burden_and_expenditure["Population (\'000) (2)"] < 1000000]
filtered_bae[10 > filtered_bae['2014']].plot.scatter(x="Population ('000) (2)", y="Infectious and parasitic diseases")
filtered_bae[10 <= filtered_bae['2014']].plot.scatter(x="Population ('000) (2)", y="Infectious and parasitic diseases")
#filtered_bae
Графики (страны разделены по низкому бюджету медицины и высокому), на которых видим, что в двух странах очень сильные всплески инфекционных заболеваний, что же это за страны?
filtered_bae[filtered_bae["Infectious and parasitic diseases"] > 2000]
burden_and_expenditure["Infectious and parasitic diseases r"]["Russia"] = filtered_bae["Infectious and parasitic diseases r"]["Russian Federation"]
#filtered_bae["Infectious and parasitic diseases r"]["Czech Republic"] = filtered_bae["Infectious and parasitic diseases r"]["Czechia"]
#filtered_bae["Infectious and parasitic diseases r"]["United Kingdom"] = filtered_bae["Infectious and parasitic diseases r"]["United Kingdom of Great Britain and Northern Ireland"]
burden_and_expenditure["Infectious and parasitic diseases r"]["Iran"] = filtered_bae["Infectious and parasitic diseases r"]["Iran (Islamic Republic of)"]
burden_and_expenditure["Infectious and parasitic diseases r"]["Venezuela"] = filtered_bae["Infectious and parasitic diseases r"]["Venezuela (Bolivarian Republic of)"]
burden_and_expenditure["Infectious and parasitic diseases r"]["Vietnam"] = filtered_bae["Infectious and parasitic diseases r"]["Viet Nam"]
burden_and_expenditure["Infectious and parasitic diseases r"]["Afghanistan":].to_json("../docs/resources/infectious.json")
Странно, что сюда не попали другие африканские страны
articles_total = pandas.DataFrame()
articles_subject = {}
#artticles[articles["Year"] == year & articles["Subject"] == subj]
for subj in article_subjects:
#articles_total[subj] = pandas.Series([0] * 10)
articles_subject[subj] = articles[articles["Subject"] == subj]
for year in range(2007, 2016):
articles_total.loc[year, subj] = 0
for year in range(2007, 2016):
articles_total.loc[year, subj] += articles_subject[subj][articles_subject[subj]["Year"] == year]["Citable documents"].sum()
pass
articles_total.plot.line(title="Amount of citable artciles in the world").get_figure().savefig("../docs/articles_stats.png")
export_table = open("../docs/articles_stats.html", "w")
print(articles_total.to_html(), file=export_table)
export_table.close()
Как мы видим, количество статей по физиологии почти не меняется, а по остальным 3 из выбранных наук растёт. Хотя количество статей по генетике остановило рост в 2012 году. Странно.
genetics_2015 = articles_subject["genetics"][articles_subject["genetics"]["Year"] == 2015].sort_values("Citable documents", ascending=False)
top_20_g_countries = list(genetics_2015[:20]["Country"])
filtered_top_g = pandas.DataFrame()
for country in top_20_g_countries:
for year in range(2007, 2016):
filtered_top_g.loc[year, country] = math.log(articles_subject["genetics"][(articles_subject["genetics"]["Country"] == country)
& (articles_subject["genetics"]["Year"] == year)]["Citable documents"].iloc[0], 10)
filtered_top_g.plot.line(title="genetics articles' amount, lg") # TODO нормально это нарисовать ?!
Можем увидеть, что из США выходит наибольшее количество статей по генетике. Тем не менее, в последние годы их количество начало снижаться (с 2012, как раз, когда на пред. графике закончился рост), а количество статей из Китая быстро увеличивается. Если приглядеться, ещё значительный рост количества статей наблюдается у Швеции.