% PhD thesis record for UCL Discovery eprint 10163568 (see the url field).
% The abstract field is informational only; the standard BibTeX styles do not print it.
@phdthesis{discovery10163568,
           pages = {1--436},
           title = {Unsupervised learning methods for identifying and evaluating disease clusters in electronic health records},
          school = {UCL (University College London)},
            year = {2023},
           month = jan,
            note = {Copyright {\copyright} The Author 2022. Original content in this thesis is licensed under the terms of the Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) Licence (https://creativecommons.org/licenses/by-nc/4.0/). Any third-party copyright material present remains the property of its respective owner(s) and is licensed under its existing terms. Access may initially be restricted at the author's request.},
        abstract = {Introduction
Clustering algorithms are a class of algorithms that can discover groups of observations in
complex data and are often used to identify subtypes of heterogeneous diseases in electronic
health records (EHR). Evaluating clustering experiments for biological and clinical significance is
a vital but challenging task due to the lack of consensus on best practices. As a result, the
translation of findings from clustering experiments to clinical practice is limited.
Aim
The aim of this thesis was to investigate and evaluate approaches that enable the evaluation of
clustering experiments using EHR.
Methods
We conducted a scoping review of clustering studies in EHR to identify common evaluation
approaches. We systematically investigated the performance of the identified approaches using
a cohort of Alzheimer's Disease (AD) patients as an exemplar comparing four different
clustering methods (K-means, Kernel K-means, Affinity Propagation and Latent Class
Analysis). Using the same population, we developed and evaluated a method (MCHAMMER)
that tested whether clusterable structures exist in EHR. To develop this method we tested
several cluster validation indexes and methods of generating null data to see which are the best
at discovering clusters. In order to enable the robust benchmarking of evaluation approaches,
we created a tool that generated synthetic EHR data that contain known cluster labels across a
range of clustering scenarios.
Results
Across 67 EHR clustering studies, the most popular internal evaluation metric was comparing
cluster results across multiple algorithms (30\% of studies). We examined this approach
conducting a clustering experiment on AD patients using a population of 10,065 AD patients and
21 demographic, symptom and comorbidity features. K-means found 5 clusters, Kernel K-means
found 2 clusters, Affinity propagation found 5 and latent class analysis found 6. K-means
was found to have the best clustering solution with the highest silhouette score (0.19) and was
more predictive of outcomes. The five clusters found were: typical AD (n=2026), non-typical AD
(n=1640), cardiovascular disease cluster (n=686), a cancer cluster (n=1710) and a cluster of
mental health issues, smoking and early disease onset (n=1528), which has been found in
previous research as well as in the results of other clustering methods. We created a synthetic
data generation tool which allows for the generation of realistic EHR clusters that can vary in
separation and number of noise variables to alter the difficulty of the clustering problem. We
found that decreasing cluster separation did increase cluster difficulty significantly whereas
noise variables increased cluster difficulty but not significantly. To develop the tool to assess
cluster existence we tested different methods of null dataset generation and cluster validation
indices; the best performing null dataset method was the min max method and the best
performing indices were the Calinski Harabasz index which had an accuracy of 94\%, Davies Bouldin
index (97\%), silhouette score (93\%) and BWC index (90\%). We further found that when clusters
were identified using the Calinski Harabasz index they were more likely to have significantly
different outcomes between clusters. Lastly we repeated the initial clustering experiment,
comparing 10 different pre-processing methods. The three best performing methods were RBF
kernel (2 clusters), MCA (4 clusters) and MCA and PCA (6 clusters). The MCA approach gave
the best results, with the highest silhouette score (0.23) and meaningful clusters, producing 4 clusters:
heart and circulatory (n=1379), early onset mental health (n=1761), male cluster with memory
loss (n = 1823), female with more problem (n=2244).
Conclusion
We have developed and tested a series of methods and tools to enable the evaluation of EHR
clustering experiments. We developed and proposed a novel cluster evaluation metric and
provided a tool for benchmarking evaluation approaches in synthetic but realistic EHR.},
             url = {https://discovery.ucl.ac.uk/id/eprint/10163568/},
          author = {Alexander, Nonie}
}