# IPython notebook preamble: clear the interactive namespace and render
# matplotlib figures inline (%-lines are IPython magics, not plain Python).
%reset -f
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import helpers as h  # project-local helpers (h.Table, h.figuresToLaTeX, h.describe, h.rstyle)
import numpy as np
import scipy as sp
import json, os
# Cleaned survey responses; the first CSV column is the row index.
joined = pd.read_csv('cleaned-terms.csv',index_col=0)
# Widen pandas display so the wide survey frame is fully visible in the notebook.
pd.options.display.max_columns=400
pd.options.display.max_colwidth= 100
# Output folders for generated LaTeX tables, images and TeX snippets.
requiredPaths = ["tab", "img", "tex"]
for p in requiredPaths:
    if not os.path.exists(p):
        os.mkdir(p)
# Human-readable descriptions of the coded columns, keyed by column name.
with open("descriptions-terms-cleaned.json", "r") as f:
    descriptions = json.load(f)
newColumnNames = joined.columns.tolist()
joined[:2]  # notebook cell: peek at the first two rows
joined['interviewtime'].plot(kind="hist")
# Columns recording the time spent on each question group.
groupNames = [x for x in newColumnNames if x.startswith("groupTime")]
#time spend reading the terms:
joined.groupTime75.describe()
joined.info()
# 'catch5' is an attention-check question; "Once per week" appears to be the
# expected answer, so anything else marks a failed check -- see failedUnicorn.
joined['catch5'].describe()
failedUnicorn = joined[joined['catch5'] != "Once per week"]
len(failedUnicorn)
failedUnicorn[["interviewtime"] + groupNames]
failedUnicorn.dropna(axis=1,how="all")
#still need coding:
joined[joined.termsUnclearCoded.isnull()]
joined[joined.phishingReasonCoded.isnull()]
joined.shape
import re  # NOTE(review): `re` is not used directly in this file -- regex patterns are passed as strings to pandas
def descriptiveValueCounts(series, caption = "", reference = "", header=None, isFloat=True, save=True, folder="terms/tab",
                           reorder = None):
    """Build a two-column h.Table of a Series' value counts.

    Parameters
    ----------
    series : pandas.Series whose value_counts() fill the table body.
    caption, reference, header : optional LaTeX caption, label and header row.
    isFloat : forwarded to h.Table.isFloat.
    save : when True, write the LaTeX table into `folder`.
    folder : output directory.  NOTE(review): the default "terms/tab" differs
        from the "tab" directory created at the top of this script -- confirm
        which one is intended before relying on the default.
    reorder : optional list of row indices giving the desired row order.

    Returns
    -------
    The populated h.Table, one "value / $count$" row per distinct value.
    """
    # `None` defaults replace the original mutable `[]` defaults (shared-object pitfall).
    t = h.Table("lc", "lc")
    t.isFloat = isFloat
    if caption:
        t.setCaption(caption)
    if reference:
        t.reference = reference
    if header:
        t.setHeader(header)
    vc = series.value_counts()
    # NOTE(review): Series.iteritems() was removed in pandas 2.0; kept here for
    # the old pandas this notebook targets -- use .items() on modern pandas.
    for k, v in vc.iteritems():
        t.addRow([k,"$%d$" %v])
    if reorder:
        t.rows = [t.rows[i] for i in reorder]
    if save:
        t.writeLatexToFile(path=folder)
    return t
def descriptiveValuePercentages(data, caption = "", reference = "", header=None, isFloat=True, save=True,
                                folder="tab", reorder = None, addRows = None):
    """Build an h.Table of per-language answer percentages.

    `data` is a SeriesGroupBy grouped by 'startlanguage', so its
    value_counts() yields ((language, answer), count) pairs.  Each table row
    is an answer, each column a language, each cell the percentage of that
    language's respondents who gave that answer.

    Parameters
    ----------
    caption, reference, header : optional LaTeX caption, label and header row.
    addRows : extra answer labels to include even if nobody chose them.
    reorder : list of indices giving the desired order of the sorted answers.
    save / folder : write the LaTeX table into `folder` when save is True.

    Returns
    -------
    The populated h.Table.
    """
    # `None` defaults replace the original mutable `[]` defaults (shared-object pitfall).
    addRows = addRows if addRows is not None else []
    # NOTE(review): iteritems() was removed in pandas 2.0 -- kept for old pandas.
    d = list(data.value_counts().iteritems())  # [((language, answer), count), ...]
    languages = list(set([x[0][0] for x in d]))
    degrees = sorted(list(set([x[0][1] for x in d])) + addRows)
    if reorder:
        degrees = [degrees[i] for i in reorder]
    # Total respondents per language -- the percentage denominator.
    languageDistribution = [sum(x[1] for x in d if x[0][0]== lan) for lan in languages]
    print(languages)  # parenthesised form behaves identically under Python 2
    def getStat(lang, deg, dist):
        # Percentage of `lang` respondents who answered `deg`; 0 when absent.
        t = [x[1] for x in d if x[0][0] == lang and x[0][1] == deg]
        return 1.0*t[0]/dist*100 if t else 0.0
    vc = [(deg, [getStat(lang, deg, dist) for dist, lang in zip(languageDistribution, languages)]) for deg in degrees]
    t = h.Table("l" + "c"*len(languages), "l" + "c"*len(languages))
    t.isFloat = isFloat
    if caption:
        t.setCaption(caption)
    if reference:
        t.reference = reference
    if header:
        t.setHeader(header)
    for k, v in vc:
        t.addRow([k] + ["$%.0f%%$" %vt for vt in v])
    if save:
        t.writeLatexToFile(path=folder)
    return t
# Per-country education-level percentages.
# NOTE(review): "Demogprahics" is a typo that ends up in the rendered caption.
descriptiveValuePercentages(joined.groupby('startlanguage').DemoEducation, caption = "Educational Demogprahics",
                            reference="demoEducation", header=["Highest Qualification", "US", "DE", "UK"],
                            reorder = [2, 0, 6, 1, 4, 7, 3])
# Survey language codes and matching country labels, kept in the same order.
languagesShort = ['en-us', 'de', 'en']
languagesLong = ["US", "DE", "UK"]
# Age histogram per country, bundled into one LaTeX multi-figure.
mf = h.figuresToLaTeX(columns=3,basename='demoAge',path='',
                      caption='Histogram of our participants age')
for sl, ll in zip(languagesShort, languagesLong):
    data = joined[joined.startlanguage == sl].demoAge
    a = plt.figure(figsize=(4,4), dpi=80)
    ax = data.plot(kind="hist",bins=np.arange(10,80,5))
    # Grey, slightly transparent bars with darker edges.
    plt.setp(ax.patches, 'facecolor', '0.3','edgecolor', '0.15', 'alpha', 0.75)
    # Re-set tick labels in TeX math mode so figure fonts match the paper.
    locs, labels = plt.xticks()
    plt.xticks(locs,[r'$%g$' %x for x in locs],size='large')
    locs, labels = plt.yticks()
    plt.yticks(locs,[r'$%g$' %x for x in locs],size='large')
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    h.rstyle(ax)
    mf.addFigure(a,subcaption="Country: %s" %ll,describeText=h.describe(data.values))
    print h.describe(data.values)
mf.writeLaTeX()
# Employment, gender, disability and language-proficiency demographics.
# NOTE(review): "Demogpraphics" below is a typo in the output caption.
descriptiveValuePercentages(joined.groupby('startlanguage').DemoEmployment,
                            caption="Employment Demogpraphics of the participants",
                            reference="demoEmployment", header=["Employment Status", "US", "DE", "UK"],
                            reorder = [0, 4, 5, 3, 2, 1])
descriptiveValuePercentages(joined.groupby('startlanguage').demoGender, caption="Gender of the participants",
                            reference="demoGender", header= ["Gender", "US", "DE", "UK"])
descriptiveValuePercentages(joined.groupby('startlanguage').DemoLearningDiff,
                            caption="Learning difficulty or disability of the participants",
                            reference="demoDisability", header=["Answer", "US", "DE", "UK"])
descriptiveValuePercentages(joined.groupby('startlanguage').demoLanguage,
                            caption="How proficient are you in English/German",
                            reference="demoLanguage", header=["Proficiency", "US", "DE", "UK"],
                            addRows=["No proficiency", "Elementary proficiency"], reorder = [5,2,3,1,0,4])
joined.demoNumberAccounts.describe()
# Number of payment cards per participant, histogram per country.
mf = h.figuresToLaTeX(columns=3,basename='numberCards',path='',
                      caption='Participants number of payment cards')
for sl, ll in zip(languagesShort, languagesLong):
    data = joined[joined.startlanguage == sl].demoNumberCards
    a = plt.figure(figsize=(4,4), dpi=80)
    ax = data.plot(kind="hist",bins=np.arange(0,11,1))
    plt.setp(ax.patches, 'facecolor', '0.3','edgecolor', '0.15', 'alpha', 0.75)
    locs, labels = plt.xticks()
    plt.xticks(locs,[r'$%g$' %x for x in locs],size='large')
    locs, labels = plt.yticks()
    plt.yticks(locs,[r'$%g$' %x for x in locs],size='large')
    plt.xlabel('Number of Payment Cards')
    plt.ylabel('Frequency')
    h.rstyle(ax)
    mf.addFigure(a,subcaption="Country: %s" %ll,describeText=h.describe(data.values))
    print h.describe(data.values)
mf.writeLaTeX()
# Number of bank accounts per participant, histogram per country.
mf = h.figuresToLaTeX(columns=3,basename='numberBankAccounts',path='',
                      caption='Participant\'s number of bank accounts')
for sl, ll in zip(languagesShort, languagesLong):
    data = joined[joined.startlanguage == sl].demoNumberAccounts
    a = plt.figure(figsize=(4,4), dpi=80)
    ax = data.plot(kind="hist",bins=np.arange(0,16,1))
    plt.setp(ax.patches, 'facecolor', '0.3','edgecolor', '0.15', 'alpha', 0.75)
    locs, labels = plt.xticks()
    plt.xticks(locs,[r'$%g$' %x for x in locs],size='large')
    locs, labels = plt.yticks()
    plt.yticks(locs,[r'$%g$' %x for x in locs],size='large')
    plt.xlabel('Number of Bank Accounts')
    plt.ylabel('Frequency')
    h.rstyle(ax)
    mf.addFigure(a,subcaption="Country: %s" %ll,describeText=h.describe(data.values))
    print h.describe(data.values)
mf.writeLaTeX()
# Card-use frequency and fraud-experience tables.
descriptiveValuePercentages(joined.groupby('startlanguage').demoCardsFrequency,
                            caption="Frequency of use of any of the participants payment cards",
                            reference="demoCardUseFrequency", header=["Frequency", "US", "DE", "UK"],
                            reorder = [0,5, 3, 2, 6, 4, 1])
# NOTE(review): "fradulent" below is a typo that appears in the output caption.
descriptiveValuePercentages(joined.groupby('startlanguage').demoExperiencedFraud,
                            caption="Have you ever experienced fradulent transactions or incidents on any of your payment cards or bank accounts?",
                            reference="demoFraudExperienced", header=["Frequency", "US", "DE", "UK"])
# reorder = [0,5, 3, 2, 6, 4, 1])
def expandCoded(data, splitString):
    """Expand a delimiter-separated code column into per-code count columns.

    Splits each entry of `data` on the `splitString` pattern and returns a
    DataFrame with one column per distinct code, counting how often that code
    occurs in each row.
    """
    pieces = data.str.split(splitString, expand=True)
    indicators = pd.get_dummies(pieces[0])
    for column in pieces.columns[1:]:
        extra = pd.get_dummies(pieces[column])
        indicators = indicators.add(extra, fill_value=0)
    return indicators
# Split the coded fraud descriptions on ",<optional non-word char>" into one
# code per column.  NOTE(review): this top-level `tdata` appears exploratory
# and unused afterwards -- the analysis below uses expandCoded() instead.
tdata = joined.demoFraudDescriptionCoded.str.split(',\W?',expand=True)
def expandedFreq(data, splitString):
    """Total frequency of each individual code in a multi-code string column.

    Splits every entry of `data` on the `splitString` pattern and returns a
    Series mapping each distinct code to the number of times it occurs across
    all rows and positions.
    """
    split_frame = data.str.split(splitString, expand=True)
    counts = split_frame[0].value_counts()
    for col in split_frame.columns[1:]:
        counts = counts.add(split_frame[col].value_counts(), fill_value=0)
    return counts
# Indicator columns for each fraud-description code.
t3 = expandCoded(joined.demoFraudDescriptionCoded, ',\W?')
# Map each code (e.g. "C1") to its human-readable description; codebook lines
# look like "<code>: <description>".
with open('demoFraudDescriptionCoded.txt', 'r') as f:
    lines = f.readlines()
mapping = dict()
for key in t3.columns:
    # Strip "<key>: " (the key plus two characters) from a matching line.
    potentialLines = [x.strip()[len(key)+2:] for x in lines if x.strip().startswith(key)]
    mapping[u"demoFraudDescriptionCoded_%s" %key] = u"%s" %potentialLines[0] if potentialLines else u""
descriptions.update(mapping)
# Prefix the indicator columns and join them onto the main frame.
t3.columns = ["demoFraudDescriptionCoded_%s" %i for i in t3.columns]
joined = pd.concat([joined, t3],axis=1)
# Per-language share of each code among participants who experienced fraud;
# demoFraudFrequency sums act as the per-language denominator.
tt = joined[joined.demoExperiencedFraud == "Yes"].groupby('startlanguage')[t3.columns]
freq = joined.groupby('startlanguage').demoFraudFrequency.sum()
freq = freq[[k for k in freq.keys() if freq[k] > 0]]
ttt = tt.sum().T.divide(freq)
ttt
def mcnemar_midp(b, c):
    """
    Compute McNemar's test using the "mid-p" variant suggested by:

    M.W. Fagerland, S. Lydersen, P. Laake. 2013. The McNemar test for
    binary matched-pairs data: Mid-p and asymptotic are better than exact
    conditional. BMC Medical Research Methodology 13: 91.

    `b` is the number of observations correctly labeled by the first---but
    not the second---system; `c` is the number of observations correctly
    labeled by the second---but not the first---system.

    Returns the mid-p value 2*P(X <= min(b,c)) - P(X = min(b,c)) for
    X ~ Binomial(b + c, 0.5); symmetric in b and c.
    """
    # Explicit import: the file only does `import scipy as sp`, which does not
    # reliably make the `scipy.stats` submodule available as `sp.stats`.
    from scipy.stats import binom
    n = b + c
    x = min(b, c)
    dist = binom(n, .5)
    # Two-sided exact p-value, then subtract the point probability once --
    # equivalent to removing half of P(X = x) from each tail ("mid-p").
    p = 2. * dist.cdf(x)
    midp = p - dist.pmf(x)
    return midp
# LaTeX table: per-country percentage of fraud-experiencers carrying each
# fraud-description code (ttt/t3 come from the fraud-code block above).
t = h.Table("lccc")
t.setHeader(["Code", "DE", "UK", "US"])
t.fromNPArray(ttt.values*100, "$%.1f%%$", rowdesc=[descriptions[x] for x in t3.columns])
t.setCaption("Thematic analysis of description of fraud experienced by participants. The first five codes describe the " + \
             "identification of fraud, the next five codes describe the type of fraud, and the last two describe the " + \
             "follow up actions that happened.")
t.reference = "demoFraudDescription"
t.writeLatexToFile(path='tab/')
t
## The two scenarios, does the person get their money back question.
# Participants per survey language -- denominator for the percentages below.
af = joined['startlanguage'].value_counts()
af
# Share of "money should be returned" answers per country, before and after
# ("Rev") the participants read the T&Cs, for both scenarios.
freq = joined.groupby('startlanguage')[['phishingMoneyBack','phishingMoneyBackRev', 'lossMoneyBack', 'lossMoneyBackRev']]
ttt = freq.sum().T.divide(af)
# Paired before/after comparison per scenario via mid-p McNemar's test.
# ct[True][False] and ct[False][True] are the two discordant-pair counts.
significances = []
for a,b in [(joined.phishingMoneyBack,joined.phishingMoneyBackRev), (joined.lossMoneyBack, joined.lossMoneyBackRev)]:
    ct = pd.crosstab(a,b)
    pMcnemar = mcnemar_midp(ct[True][False],ct[False][True])
    if pMcnemar < 0.05:
        significances.append("significant with $p<%s$" %("0.01" if pMcnemar <0.01 else "0.05"))
    else:
        significances.append("not significant")
t = h.Table("lccc")
t.setHeader(["Question", "DE", "UK", "US"])
t.fromNPArray(ttt.values*100, "$%.1f%%$", rowdesc=["Scenario Phishing", "Scenario Phishing after T&Cs",
                                                   "Scenario Theft", "Scenario Theft after T&Cs"])
t.setCaption("Percentage of participants that say that the money should be returned in each of the scenarios. " + \
             "McNemar's test is %s for the Scenario Phishing and is %s for the Scenario Theft." %tuple(significances))
t.reference = "scenarioMoneyReturned"
t.writeLatexToFile(path='tab/')
t
significances
# Third-party banking tools and T&C comprehension questions.
descriptiveValuePercentages(joined.groupby('startlanguage').toolHeard,
                            caption="Have you ever used third party online banking services?",
                            reference="toolUsed", header=["Frequency", "US", "DE", "UK"], reorder = [1, 0])
#We did ask more questions here, but I think its pointless to analyse them here as the sample size is < 20.
# Lets look at some comprehension questions.
descriptiveValuePercentages(joined.groupby('startlanguage').termsConfidence,
                            caption="How confident are you that you have understood the T&Cs?",
                            reference="termsConfidence", header=["Level", "US", "DE", "UK"],
                            addRows=[" Understood nothing"], reorder = [0,4,2,3,1])
# Expand the "what was unclear in the T&Cs" codes into indicator columns and
# pull each code's description from the codebook file.
t3 = expandCoded(joined.termsUnclearCoded, ',\W?')
with open('termsUnclearCoded.txt', 'r') as f:
    lines = f.readlines()
mapping = dict()
for key in t3.columns:
    # Strip "<key>: " from the matching codebook line.
    potentialLines = [x.strip()[len(key)+2:] for x in lines if x.strip().startswith(key)]
    mapping[u"termsUnclearCoded_%s" %key] = u"%s" %potentialLines[0] if potentialLines else u""
descriptions.update(mapping)
t3.columns = ["termsUnclearCoded_%s" %i for i in t3.columns]
t3.columns
print "\n".join(["%s : %s" %(key, mapping[key]) for key in t3.columns])
joined = pd.concat([joined, t3],axis=1)
# Percentage of participants per country mentioning each "unclear" code.
tt = joined.groupby('startlanguage')[t3.columns]
freq = joined['startlanguage'].value_counts()
freq = freq[[k for k in freq.keys() if freq[k] > 0]]
ttt = tt.sum().T.divide(freq)
t = h.Table("lccc")
t.setHeader(["Code", "DE", "UK", "US"])
t.fromNPArray(ttt.values*100, "$%.1f%%$", rowdesc=[descriptions[x] for x in t3.columns])
t.setCaption("Thematic analysis of understanding issues of the T&Cs of the participants.")
t.reference = "termsUnclear"
t.writeLatexToFile(path='tab/')
t
descriptiveValuePercentages(joined.groupby('startlanguage').compShare,
                            caption=descriptions['compShare'],
                            reference="compShare", header=["Frequency", "US", "DE", "UK"], reorder = [1, 0])
descriptiveValuePercentages(joined.groupby('startlanguage').compCardCheque,
                            caption=descriptions['compCardCheque'],
                            reference="compCardCheque", header=["Frequency", "US", "DE", "UK"], reorder = [1, 0])
# Four free-text comprehension questions, each already thematically coded.
toDraw = ["compRememberCoded", "compOnlineBankingCoded", "compLiableCoded", "compNegligenceCoded"]
t3s = []
for question in toDraw:
    t3 = expandCoded(joined[question], ',\W?')
    with open('comprehensionCodes-clean.txt', 'r') as f:
        lines = f.readlines()
    mapping = dict()
    # compNegligenceCoded takes the second matching codebook line, the rest the first.
    i = 0 if question != "compNegligenceCoded" else 1
    for key in t3.columns:
        potentialLines = [x.strip()[len(key)+2:] for x in lines if x.strip().startswith(key)]
        mapping[u"%s_%s" %(question,key)] = u"%s" %potentialLines[i] if potentialLines else u""
    descriptions.update(mapping)
    # NOTE(review): this list-comp rebinds `i` (Python 2 comprehensions leak
    # their loop variable), shadowing the codebook index above -- harmless
    # here only because `i` is reassigned at the top of each outer iteration.
    t3.columns = ["%s_%s" %(question,i) for i in t3.columns]
    t3s.append(t3)
for t3 in t3s:
    joined = pd.concat([joined, t3],axis=1)
for t3 in t3s:
    print "\n".join(["%s : %s" %(key, descriptions[key]) for key in t3.columns])
# One LaTeX table per comprehension question with per-country code percentages.
for t3, question in zip(t3s, toDraw):
    tt = joined.groupby('startlanguage')[t3.columns]
    freq = joined['startlanguage'].value_counts()
    freq = freq[[k for k in freq.keys() if freq[k] > 0]]
    ttt = tt.sum().T.divide(freq)
    t = h.Table("lccc")
    t.setHeader(["Code", "DE", "UK", "US"])
    t.fromNPArray(ttt.values*100, "$%.1f%%$", rowdesc=[descriptions[x] for x in t3.columns])
    # question[:-5] strips the "Coded" suffix to recover the base column name.
    t.setCaption("Thematic analysis of the answers to the comprehension question: \"%s\"" %descriptions[question[:-5]])
    t.reference = question[:-5]
    t.writeLatexToFile(path='tab/')
    t.display()
# Secondary-coding merge maps: during review several codes were collapsed into
# others; keys are codes to replace, values their replacement code(s).
# NOTE(review): these dicts are built into `toFix` but no replacement step is
# visible in this file -- confirm whether the substitution happens elsewhere.
ReplacementPhishingReasonRevCoded = {"C6" : "C5", "C8" : "C3", "C11" : "C1", "C7" : "C1", "C10" : "C2", "C9" : "C2",
                                     "Code3" : "Code11, Code5", "Code7" : "Code2", "Code8" : "Code12, Code11, Code6",
                                     "Code9" : "Code6, Code12", "Code10" : "Code5"}
ReplacementPhishingReasonCoded = {"C9" : "C2", "C5" : "C4", "C12" : "C2", "C13" : "C2", "C11" : "C14", "C15" : "C16",
                                  "C10" : "C4", "C17" : "C16, C14", "CODE3" : "CODE2", "CODE6" : "CODE7", "CODE8" : "CODE7",
                                  "CODE9" : "CODE7", "CODE10" :"CODE7, CODE2", "CODE11" : "CODE7", "12" : "CODE2, CODE4",
                                  "CODE14" : "CODE7", "13" : "C0", "CODE16" : "CODE15", "C7" : "C2, C3", "CODE5" : "CODE1"}
ReplacementLossReasonCoded = {"C2" : "C1", "C7" : "C4", "C8" : "C4", "C11" : "C9", "CODE3" : "CODE2", "CODE5" : "CODE4",
                              "10" : "CODE4", "CODE11" : "CODE7", "CODe8": "CODE6", "CODE12": "CODE9"}
ReplacementLossReasonRevCoded = {"C6" : "C3", "C8" : "C1", "C9" : "C1", "Code2" : "Code3", "Code5" : "Code3"}
# (question column, replacement map) pairs, in the codebook's section order.
toFix = [("phishingReasonCoded", ReplacementPhishingReasonCoded),
         ("phishingReasonRevCoded", ReplacementPhishingReasonRevCoded),
         ("lossReasonCoded", ReplacementLossReasonCoded), ("lossReasonRevCoded", ReplacementLossReasonRevCoded)]
t3s = []
# The secondary codebook holds one "---"-separated section per question.
with open('secondary-coding-codebook.txt', 'r') as f:
    content = f.read().split("---")[1:]
# NOTE(review): zip(*toFix)[0] is Python 2 only -- in Python 3, zip() returns
# an iterator and cannot be subscripted.
for question, fulltext in zip(zip(*toFix)[0],content):
    t3 = expandCoded(joined[question], ',\W?')
    lines = fulltext.splitlines()
    mapping = dict()
    for key in t3.columns:
        # Strip "<key>: " from the matching codebook line.
        potentialLines = [x.strip()[len(key)+2:] for x in lines if x.strip().startswith(key)]
        mapping[u"%s_%s" %(question,key)] = u"%s" %potentialLines[0] if potentialLines else u""
    #print mapping
    descriptions.update(mapping)
    t3.columns = ["%s_%s" %(question,i) for i in t3.columns]
    t3s.append(t3)
for t3 in t3s:
    joined = pd.concat([joined, t3],axis=1)
#We can draw 8 Tables, for each the positive reasons and negative reasons why people decide to think one way or another.
columns = [x for x in descriptions.keys() if x.startswith("%sReason%sCoded" %("loss", ""))]
columns
joined[(joined.lossMoneyBack == True) & (joined.lossReasonCoded_CODE1 > 0)]
# 2 scenarios x for/against reimbursement x before/after T&Cs = 8 tables.
for scenario in ["phishing", "loss"]:
    for foragainst in [True, False]:
        for beforeAfter in ["", "Rev"]:
            # Select the expanded code columns for this scenario/phase.
            # NOTE(review): if descriptions also contains the bare key
            # "<scenario>Reason<phase>Coded" (the question text itself), the
            # startswith filter picks it up too -- verify the key set.
            columns = [x for x in descriptions.keys() if x.startswith("%sReason%sCoded" %(scenario, beforeAfter))]
            columns = sorted(columns,key=lambda x: descriptions[x])
            data = joined[joined["%sMoneyBack%s" %(scenario,beforeAfter)] == foragainst].groupby('startlanguage')[columns]
            freq = joined[joined["%sMoneyBack%s" %(scenario,beforeAfter)] == foragainst]['startlanguage'].value_counts()
            freq = freq[[k for k in freq.keys() if freq[k] > 0]]
            # Drop codes that nobody in this participant subset mentioned.
            ttt = data.sum().T[data.sum().sum() > 0].divide(freq)
            print scenario, foragainst, beforeAfter
            print ttt
            t = h.Table("lccc")
            t.setHeader(["Code", "DE", "UK", "US"])
            t.fromNPArray(ttt.values*100, "$%.1f%%$", rowdesc=[descriptions[x] for x in ttt.T.columns])
            t.setCaption("Thematic analysis of the answers%s in support of reimbursement in scenario %s%s." %(
                (" not" if not foragainst else ""), scenario,
                ", after the participants have seen the T&Cs." if beforeAfter else ""))
            t.reference = "%sReason%s%s" %(scenario, "For" if foragainst else "Against", beforeAfter)
            t.writeLatexToFile(path='tab/')
            t.display()
ttt