# Package setup for the admixture analysis notebook.
# NOTE(review): flextable is attached twice below, and dplyr/stringr are
# already attached by tidyverse -- harmless duplication, could be trimmed.
library(tidyverse)
library(here)
library(colorout)
library(dplyr)
library(flextable)
library(ggstatsplot)
library(extrafont)
library(stringr)
library(flextable)
library(officer)
For Admixture, we can clone the package on the cluster.
git clone https://github.com/NovembreLab/admixture.git
# check it
./admixture/releases/admixture_linux-1.3.0/admixture -h
Export path
We created 3 sets of SNPs. One with intergenic SNPs that we called neutral. Then, one with pruning using r2 of 0.01 and another with r2 of 0.1. They have different numbers of SNPs but the same 237 samples.
## BEN
## CAM
## CHA
## GEL
## HAI
## HAN
## HOC
## HUN
## INJ
## INW
## JAF
## KAC
## KAG
## KAN
## KAT
## KLP
## KUN
## LAM
## MAT
## OKI
## QNC
## SON
## SSK
## SUF
## SUU
## TAI
## UTS
## YUN
Some of these sampling localities are very close to each other, but we will count the total number of sampling groups.
## 28
First we need to test if our command will create the directories we
want. We can use echo
instead of mkdir
to test
it.
Change echo to mkdir
and create all directories. First
option with seq using -w flag to have leading zeros (check
“man seq
", -w, --equal-width).
Next we have to create the script to run Admixture. First, lets test the loop using k=1 to k=5 and 5 runs for each k. Note that for runs we use seq -w (we get run01, run02, etc., when we have 10 runs). For K, we don’t want the leading zeros, so we don’t use the -w flag.
## run= 1 k= 1 seed= 1
## run= 1 k= 2 seed= 1
## run= 2 k= 1 seed= 2
## run= 2 k= 2 seed= 2
Check the Admixture options.
admixture --help
**** ADMIXTURE Version 1.3.0 ****
**** Copyright 2008-2015 ****
**** David Alexander, Suyash Shringarpure, ****
**** John Novembre, Ken Lange ****
**** ****
**** Please cite our paper! ****
**** Information at www.genetics.ucla.edu/software/admixture ****
ADMIXTURE basic usage: (see manual for complete reference)
% admixture [options] inputFile K
where:
K is the number of populations; and
inputFile may be:
- a PLINK .bed file
- a PLINK "12" coded .ped file
Output will be in files inputBasename.K.Q, inputBasename.K.P
General options:
-jX : do computation on X threads
--seed=X : use random seed X for initialization
Algorithm options:
-m=
--method=[em|block] : set method. block is default
-a=
--acceleration=none |
sqs<X> |
qn<X> : set acceleration
Convergence criteria:
-C=X : set major convergence criterion (for point estimation)
-c=x : set minor convergence criterion (for bootstrap and CV reestimates)
Bootstrap standard errors:
-B[X] : do bootstrapping [with X replicates]
Generate the script for Admixture. We will use 1000 bootstraps and cv=10.
# Example invocations (file.bed and $name are placeholders):
admixture file.bed $name -j10 # default settings; -j10 runs with 10 threads
admixture file.bed $name -j20 --cv=10 -B1000 # 1000 bootstraps, cross-validation 10, 20 threads (note: -B takes a single dash, per the usage text above)
Test loop to make script for dsq
.
# 2000 bootstraps and cv 10
# Emit one command line per (run, k) combination for dsq.
# $run and $k are expanded NOW (they are the loop variables), but \$PATH is
# escaped so it expands on the CLUSTER at run time -- an unescaped $PATH
# bakes the local machine's PATH into the job file.
for run in $(seq 1); do
  for k in $(seq 1 2); do
    echo "cd /gpfs/ycga/project/caccone/lvc26/admixture/neutral/run$run; \
export PATH=\$PATH:/gpfs/ycga/project/caccone/lvc26/albo_manuscript/admixture/admixture/releases/admixture_linux-1.3.0/; \
admixture -s $run --cv=10 -B2000 -j20 /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/neutral.bed $k | tee log_k$k.txt"
  done
done | head
## cd /gpfs/ycga/project/caccone/lvc26/admixture/neutral/run1; export PATH=/Users/lucianocosme/opt/miniconda3/bin:/Users/lucianocosme/opt/miniconda3/bin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Library/TeX/texbin:/usr/local/go/bin:/opt/X11/bin:/Library/Apple/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/Users/lucianocosme/Applications/quarto/bin:/usr/texbin:/Applications/RStudio.app/Contents/Resources/app/bin/postback:/gpfs/ycga/project/caccone/lvc26/albo_manuscript/admixture/admixture/releases/admixture_linux-1.3.0/; admixture -s 1 --cv=10 -B2000 -j20 /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/neutral.bed 1 | tee log_k1.txt
## cd /gpfs/ycga/project/caccone/lvc26/admixture/neutral/run1; export PATH=/Users/lucianocosme/opt/miniconda3/bin:/Users/lucianocosme/opt/miniconda3/bin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Library/TeX/texbin:/usr/local/go/bin:/opt/X11/bin:/Library/Apple/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/Users/lucianocosme/Applications/quarto/bin:/usr/texbin:/Applications/RStudio.app/Contents/Resources/app/bin/postback:/gpfs/ycga/project/caccone/lvc26/albo_manuscript/admixture/admixture/releases/admixture_linux-1.3.0/; admixture -s 1 --cv=10 -B2000 -j20 /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/neutral.bed 2 | tee log_k2.txt
Now we can generate the full script for dsq.
We tested Admixture with several runs for each data set. Admixture always returned k=5 or k=6 as the number of ancestral populations. We will do one more run with the same SNP sets we used for all algorithms.
for run in $(seq 1); do
for k in $(seq 1 25); do
echo "cd /gpfs/ycga/project/caccone/lvc26/admixture/neutral/run$run; \
export PATH=$PATH:/gpfs/ycga/project/caccone/lvc26/albo_manuscript/admixture/admixture/releases/admixture_linux-1.3.0/; \
admixture -s $run --cv=10 -B1000 -j20 /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/neutral.bed $k | tee log_k$k.txt"
done
done > dsq1.txt
Generate the dsq batch files on the cluster.
module load dSQ/1.05
# make script
# https://docs.ycrc.yale.edu/clusters-at-yale/job-scheduling/dsq/
#
# create dsq file
# One array task per line of dsq1.txt; logs go to the logs/ directory
# (%A = array job id, %3a = zero-padded task id, %N = node).
# NOTE(review): each job runs admixture with -j20 threads but only
# --cpus-per-task=10 CPUs are requested -- confirm this oversubscription
# is intentional.
dsq \
--job-file dsq1.txt \
--output /gpfs/ycga/project/caccone/lvc26/admixture/neutral/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 1g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--job-name admixtureN \
--batch-file admixture.sh \
--partition=scavenge
#
Before we submit all jobs, it is a good practice to submit only one job of the array and see if it runs without problems.
We tested admixture with several runs for each data set. Admixture always returned k=5 or k=6 as the number of ancestral populations. We will do one more run with the same SNP sets we used for all algorithms
# Generate the full dsq job file: 1 run x K=1..25 for the neutral set.
# \$PATH is escaped so it expands on the cluster at run time instead of
# baking the local machine's PATH into the job file.
for run in $(seq 1); do
  for k in $(seq 1 25); do
    echo "cd /gpfs/ycga/project/caccone/lvc26/admixture/neutral/run$run; \
export PATH=\$PATH:/gpfs/ycga/project/caccone/lvc26/albo_manuscript/admixture/admixture/releases/admixture_linux-1.3.0/; \
admixture -s $run --cv=10 -B1000 -j20 /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/neutral.bed $k | tee log_k$k.txt"
  done
done > dsq1.txt
Generate the dsq batch files on the cluster.
module load dSQ/1.05
# make script
# https://docs.ycrc.yale.edu/clusters-at-yale/job-scheduling/dsq/
#
# create dsq file
# NOTE(review): jobs run admixture with -j20 threads but only 10 CPUs are
# requested per task -- confirm this oversubscription is intentional.
dsq \
--job-file dsq1.txt \
--output /gpfs/ycga/project/caccone/lvc26/admixture/r2_0.01/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 1g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--job-name admixture_r2_0.01 \
--batch-file admixture.sh \
--partition=scavenge
#
We tested admixture with several runs for each data set. Admixture always returned k=5 or k=6 as the number of ancestral populations. We will do one more run with the same SNP sets we used for all algorithms
# Generate the full dsq job file: 1 run x K=1..25 for the r2_0.01 set.
# \$PATH is escaped so it expands on the cluster at run time instead of
# baking the local machine's PATH into the job file.
for run in $(seq 1); do
  for k in $(seq 1 25); do
    echo "cd /gpfs/ycga/project/caccone/lvc26/admixture/r2_0.01/run$run; \
export PATH=\$PATH:/gpfs/ycga/project/caccone/lvc26/albo_manuscript/admixture/admixture/releases/admixture_linux-1.3.0/; \
admixture -s $run --cv=10 -B1000 -j20 /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/r2_0.01.bed $k | tee log_k$k.txt"
  done
done > dsq1.txt
Generate the dsq batch files on the cluster.
module load dSQ/1.05
# make script
# https://docs.ycrc.yale.edu/clusters-at-yale/job-scheduling/dsq/
#
# create dsq file
# NOTE(review): jobs run admixture with -j20 threads but only 10 CPUs are
# requested per task -- confirm this oversubscription is intentional.
dsq \
--job-file dsq1.txt \
--output /gpfs/ycga/project/caccone/lvc26/admixture/r2_0.1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 1g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--job-name admixture_r2_0.1 \
--batch-file admixture.sh \
--partition=scavenge
#
If it runs without problems, we can submit all jobs.
Check the jobs status.
# create a directory for the logs
# NOTE(review): consider `mkdir -p logs` so a rerun does not fail if it exists
cd /gpfs/ycga/project/caccone/lvc26/admixture/r2_0.1
mkdir logs
# submit the dsq batch script generated above
sbatch admixture.sh
# Submitted batch job 6272864
# check status
dsqa -j 6272864 # PREEMPTED 5 11,16,18-20
Run autopsy once it is done.
# Collect the PREEMPTED tasks into a new job file and count them
dsqa -j 6272864 -f dsq1.txt -s PREEMPTED > rerun_jobs.txt; wc -l rerun_jobs.txt
# re submit the 30 jobs
# (longer walltime on the week partition; --submit dispatches immediately)
dsq \
--job-file rerun_jobs.txt \
--mem-per-cpu 1g \
--cpus-per-task=10 \
--time 120:00:00 \
--mail-type END \
--partition=week \
--job-name admixture2 \
--output /gpfs/ycga/project/caccone/lvc26/admixture/r2_0.1/logs/admix-%A_%3a-%N.txt \
--batch-file admixture2.sh \
--submit #job 6350311
# check status
dsqa -j 6350311
Collect the cross validation information from all the log files. We have to do it for each run. We need to get the CV and the K from each log file.
# navigate to the data directory
cd /gpfs/ycga/project/caccone/lvc26/admixture/neutral
# let's check the log files using grep. We need to get the line with the CV values. We can grep CV
# (each matching line looks like: "CV error (K=5): 0.699")
grep CV run1/log*.txt | head
Make loop to get summary
cd /gpfs/ycga/project/caccone/lvc26/admixture/neutral
#
# Build a "run k cv" summary table from the log files.
# Iterate over the glob directly instead of parsing `ls` output
# (fragile; ShellCheck SC2045) and quote "$i" for safety.
for i in run1/log*.txt; do
  grep -H CV "$i" | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | sed 's|_||' | awk '{print $1, $3, $4}'
done > cross_validation_neutral_set.txt
Download the data
rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/admixture/neutral/cross_validation_neutral_set.txt /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/admixture/neutral
Import the cross validation values into R
# Import the cross-validation results (space-delimited columns: run, k, cv,
# produced by the awk summary on the cluster).
cross_val_default <-
  read_delim(
    here(
      "output", "populations", "admixture", "neutral", "cross_validation_neutral_set.txt"
    ),
    delim = " ", # awk writes space-separated fields; don't rely on delimiter guessing
    col_names = FALSE,
    show_col_types = FALSE,
    col_types = "cin"
  ) |>
  rename(
    run = 1, # run number (character)
    k = 2,   # K, number of ancestral populations (integer)
    cv = 3   # cross-validation error (double)
  )
Lets make cv plot for the default run
# Cross-validation error by K for the neutral SNP set.
cross_val_default |>
  ggplot() +
  geom_line(
    aes(x = k, y = cv),
    linewidth = .75,
    color = "magenta"
  ) +
  labs(
    x = "K",
    y = "Cross-validation error",
    title = "Admixture Cross-validation neutral SNPs",
    subtitle = "1000 bootstraps and cv = 10 ",
    caption = "algorithm runs for choices of K ranging from 1 to 25"
  ) +
  hrbrthemes::theme_ipsum(
    base_family = "",
    axis_text_size = 12,
    axis_title_size = 14,
    plot_margin = margin(10, 10, 10, 10),
    grid = TRUE,
    grid_col = "#fabbe2"
  ) +
  theme(
    panel.grid.major = element_line(
      linetype = "dashed",
      linewidth = 0.2 # trailing comma removed (was passing an empty argument)
    ),
    legend.title = element_text(
      size = 14,
      face = "bold"
    ),
    panel.grid.minor = element_line(
      linetype = "dashed",
      linewidth = 0.2
    )
  )
#
# save the plot (ggsave() with no plot argument saves the last plot displayed)
# NOTE(review): "admixuture" in the file name is a typo, kept for continuity
ggsave(
  here(
    "output", "populations", "figures", "admixuture_default_run_k1_k25.pdf"
  ),
  width = 8,
  height = 6,
  units = "in"
)
Now we can find the K with the lowest cross-validation error
# Smallest cross-validation error over all runs and K values.
# Left assignment `<-` replaces the original right assignment `->`.
cv_min_default <- cross_val_default |>
  summarize(
    LowestCVerror = min(cv, na.rm = TRUE)
  )

# Run / K combination that achieved the lowest CV error.
# dplyr::select is namespaced to avoid masking (consistent with the
# r2_0.1 section of this document).
k_with_lowest_cv_default <- cross_val_default |>
  slice(which.min(cv)) |>
  dplyr::select(run, k, cv) |>
  rename(LowestCVerror = 3)
k_with_lowest_cv_default
## # A tibble: 1 × 3
## run k LowestCVerror
## <chr> <int> <dbl>
## 1 1 5 0.699
k=5
Collect the cross validation information from all the log files. We have to do it for each run. We need to get the CV and the K from each log file.
Make loop to get the summary
cd /gpfs/ycga/project/caccone/lvc26/admixture/r2_0.01
#
# Build a "run k cv" summary table from the log files.
# Iterate over the glob directly instead of parsing `ls` output
# (fragile; ShellCheck SC2045) and quote "$i" for safety.
for i in run1/log*.txt; do
  grep -H CV "$i" | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | sed 's|_||' | awk '{print $1, $3, $4}'
done > cross_validation_r2_0.01_set.txt
Download the data
rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/admixture/r2_0.01/cross_validation_r2_0.01_set.txt /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/admixture/r2_0.01
Import the cross validation values into R
# Import the cross-validation results for the r2_0.01 set
# (space-delimited columns: run, k, cv).
cross_val_default <-
  read_delim(
    here(
      "output", "populations", "admixture", "r2_0.01", "cross_validation_r2_0.01_set.txt"
    ),
    delim = " ", # awk writes space-separated fields; don't rely on delimiter guessing
    col_names = FALSE,
    show_col_types = FALSE,
    col_types = "cin"
  ) |>
  rename(
    run = 1, # run number (character)
    k = 2,   # K, number of ancestral populations (integer)
    cv = 3   # cross-validation error (double)
  )
Lets make cv plot for the default run
# make plot: cross-validation error by K for the r2_0.01 SNP set
cross_val_default |>
  ggplot() +
  geom_line(
    aes(x = k, y = cv),
    linewidth = .75,
    color = "magenta"
  ) +
  labs(
    x = "K",
    y = "Cross-validation error",
    title = "Admixture Cross-validation r2_0.01 SNPs",
    subtitle = "1000 bootstraps and cv = 10 ",
    caption = "algorithm runs for choices of K ranging from 1 to 25"
  ) +
  hrbrthemes::theme_ipsum(
    base_family = "",
    axis_text_size = 12,
    axis_title_size = 14,
    plot_margin = margin(10, 10, 10, 10),
    grid = TRUE,
    grid_col = "#fabbe2"
  ) +
  theme(
    panel.grid.major = element_line(
      linetype = "dashed",
      linewidth = 0.2 # trailing comma removed (was passing an empty argument)
    ),
    legend.title = element_text(
      size = 14,
      face = "bold"
    ),
    panel.grid.minor = element_line(
      linetype = "dashed",
      linewidth = 0.2
    )
  )
#
# save the plot (ggsave() with no plot argument saves the last plot displayed)
ggsave(
  here(
    "output", "populations", "figures", "admixuture_r2_0.01_run_k1_k25.pdf"
  ),
  width = 8,
  height = 6,
  units = "in"
)
Now we can find the K with the lowest cross-validation error
# Smallest cross-validation error over all runs and K values.
# Left assignment `<-` replaces the original right assignment `->`.
cv_min_default <- cross_val_default |>
  summarize(
    LowestCVerror = min(cv, na.rm = TRUE)
  )

# Run / K combination that achieved the lowest CV error.
# dplyr::select is namespaced to avoid masking (consistent with the
# r2_0.1 section of this document).
k_with_lowest_cv_default <- cross_val_default |>
  slice(which.min(cv)) |>
  dplyr::select(run, k, cv) |>
  rename(LowestCVerror = 3)
k_with_lowest_cv_default
## # A tibble: 1 × 3
## run k LowestCVerror
## <chr> <int> <dbl>
## 1 1 5 0.746
k=5
Collect the cross validation information from all the log files. We have to do it for each run. We need to get the CV and the K from each log file.
Make loop to get the summary
cd /gpfs/ycga/project/caccone/lvc26/admixture/r2_0.1
#
# Build a "run k cv" summary table from the log files.
# Iterate over the glob directly instead of parsing `ls` output
# (fragile; ShellCheck SC2045) and quote "$i" for safety.
for i in run1/log*.txt; do
  grep -H CV "$i" | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | sed 's|_||' | awk '{print $1, $3, $4}'
done > cross_validation_r2_0.1_set.txt
Download the data
rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/admixture/r2_0.1/cross_validation_r2_0.1_set.txt /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/admixture/r2_0.1
Import the cross validation values into R
# Import the cross-validation results for the r2_0.1 set
# (space-delimited columns: run, k, cv).
cross_val_default <-
  read_delim(
    here(
      "output", "populations", "admixture", "r2_0.1", "cross_validation_r2_0.1_set.txt"
    ),
    delim = " ", # awk writes space-separated fields; don't rely on delimiter guessing
    col_names = FALSE,
    show_col_types = FALSE,
    col_types = "cin"
  ) |>
  rename(
    run = 1, # run number (character)
    k = 2,   # K, number of ancestral populations (integer)
    cv = 3   # cross-validation error (double)
  )
Lets make cv plot for the default run
# make plot: cross-validation error by K for the r2_0.1 SNP set
cross_val_default |>
  ggplot() +
  geom_line(
    aes(x = k, y = cv),
    linewidth = .75,
    color = "magenta"
  ) +
  labs(
    x = "K",
    y = "Cross-validation error",
    # BUG FIX: the title previously said "r2_0.01" (copy-paste from the
    # previous section); this plot is for the r2_0.1 set.
    title = "Admixture Cross-validation r2_0.1 SNPs",
    subtitle = "1000 bootstraps and cv = 10 ",
    caption = "algorithm runs for choices of K ranging from 1 to 25"
  ) +
  hrbrthemes::theme_ipsum(
    base_family = "",
    axis_text_size = 12,
    axis_title_size = 14,
    plot_margin = margin(10, 10, 10, 10),
    grid = TRUE,
    grid_col = "#fabbe2"
  ) +
  theme(
    panel.grid.major = element_line(
      linetype = "dashed",
      linewidth = 0.2 # trailing comma removed (was passing an empty argument)
    ),
    legend.title = element_text(
      size = 14,
      face = "bold"
    ),
    panel.grid.minor = element_line(
      linetype = "dashed",
      linewidth = 0.2
    )
  )
#
# save the plot (ggsave() with no plot argument saves the last plot displayed)
ggsave(
  here(
    "output", "populations", "figures", "admixuture_r2_0.1_run_k1_k25.pdf"
  ),
  width = 8,
  height = 6,
  units = "in"
)
Now we can find the K with the lowest cross-validation error
# Smallest cross-validation error over all runs and K values.
# Left assignment `<-` replaces the original right assignment `->`.
cv_min_default <- cross_val_default |>
  summarize(
    LowestCVerror = min(cv, na.rm = TRUE)
  )

# Run / K combination that achieved the lowest CV error.
k_with_lowest_cv_default <- cross_val_default |>
  slice(which.min(cv)) |>
  dplyr::select(run, k, cv) |>
  rename(LowestCVerror = 3)
k_with_lowest_cv_default
## # A tibble: 1 × 3
## run k LowestCVerror
## <chr> <int> <dbl>
## 1 1 5 0.676
k=5
Download the data
rsync -chavzP --stats --include='*/' --include='*.Q' --exclude='*' lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/admixture/neutral/* /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/admixture/neutral
Check the downloaded data
Make plot
# Ancestry coefficients (Q matrix) for K = 5, run 1, neutral SNP set.
# One row per individual, one space-separated column per ancestral component.
k5run1 <- here("output", "populations", "admixture", "neutral", "run1", "neutral.5.Q") |>
  read_delim(
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.349 0.251 0.0197 0.0318 0.349
## 2 0.314 0.252 0.0244 0.00540 0.405
## 3 0.321 0.225 0.0554 0.00001 0.399
## 4 0.314 0.235 0.0173 0.0558 0.379
## 5 0.300 0.244 0.00001 0.0461 0.410
## 6 0.317 0.226 0.0539 0.0105 0.393
The fam file
# Path to the PLINK .fam file for the neutral SNP set
fam_file <- here("output", "populations", "snps_sets", "neutral.fam")
# PLINK .fam files carry these six columns, in this order
fam_cols <- c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype")
# Read the .fam file
fam_data <- read.table(fam_file, header = FALSE, col.names = fam_cols)
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Rename the PLINK identifier columns to the short names used downstream:
# IndividualID -> ind, FamilyID -> pop.
names(fam_data)[names(fam_data) == "IndividualID"] <- "ind"
names(fam_data)[names(fam_data) == "FamilyID"] <- "pop"
# Keep only the individual and population identifiers
# (an alternative merged FamilyID_IndividualID id was considered but not used)
fam_data <- fam_data |>
  dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.348771 0.250551 0.019674 0.031750 0.349254
## 2 1002 OKI 0.313650 0.251651 0.024403 0.005396 0.404899
## 3 1003 OKI 0.321265 0.224595 0.055417 0.000010 0.398712
## 4 1004 OKI 0.313674 0.234637 0.017323 0.055846 0.378520
## 5 1005 OKI 0.299965 0.244414 0.000010 0.046103 0.409507
## 6 1006 OKI 0.317372 0.225772 0.053897 0.010453 0.392506
Rename the columns
# Rename the ancestry columns (read in as X1..X5) to v1..v5, matching the
# color palette keys defined further below.
# NOTE(review): this assumes `ind` and `pop` were already attached to
# k5run1 in a previous (not shown) chunk -- see the "Add it to matrix" output.
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.348771 0.250551 0.019674 0.031750 0.349254
## 2 1002 OKI 0.313650 0.251651 0.024403 0.005396 0.404899
## 3 1003 OKI 0.321265 0.224595 0.055417 0.000010 0.398712
## 4 1004 OKI 0.313674 0.234637 0.017323 0.055846 0.378520
## 5 1005 OKI 0.299965 0.244414 0.000010 0.046103 0.409507
## 6 1006 OKI 0.317372 0.225772 0.053897 0.010453 0.392506
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
# Load the project's plotting helpers (presumably defines my_theme() used
# below -- confirm against scripts/analysis/my_theme3.R).
source(
here(
"scripts", "analysis", "my_theme3.R"
)
)
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
# (countries missing from the vector above would get NA here)
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Reshape the Q matrix to long format and attach sampling metadata.
# Melt the data frame for plotting
Q_melted <- k5run1 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
# NOTE(review): Q_ordered stays grouped by Region_Country after this step
# (no ungroup()) -- downstream code appears to rely only on row order.
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Custom axis labeller: show the country name only at the first individual
# of each country (positions listed in label_df), empty strings elsewhere.
label_func <- function(x) {
  out <- character(length(x))
  out[x %in% label_df$ind] <- label_df$label
  out
}
# Calculate the position of lines
# NOTE(review): border_positions is recomputed below with a different
# offset, and neither version is referenced by the final plot -- this
# first computation appears to be dead code.
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
# (one row per population: first individual, with "POP - City" label text)
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
# NOTE(review): pop_labels_bars is recomputed identically to the version
# above -- harmless duplication.
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Drop minor ancestry components (value <= min_value) and renormalize the
# remainder so each individual's retained proportions sum to 1.
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value)) |>
    ungroup() # don't leak the row grouping to downstream code
}
# Use the function: drop components below 10% ancestry
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
# Fixed colors for the five ancestry components (keys match column names v1..v5)
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#FFFF00",
"v3" = "#0000FF",
"v4" = "#FF00FF",
"v5" = "#FF0000"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name
# NOTE(review): the `color` column holds the palette KEYS ("v1".."v5",
# identical to `variable`), not hex values; scale_fill_manual() below maps
# those keys to the actual colors.
color_mapping <- data.frame(variable = all_variables,
color = names(color_palette))
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot: one stacked bar of ancestry proportions per individual,
# with vertical lines between populations and rotated population labels.
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:25 with 9,047 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Download the data
rsync -chavzP --stats --include='*/' --include='*.Q' --exclude='*' lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/admixture/r2_0.01/* /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/admixture/r2_0.01
Check the downloaded data
## r2_0.01.23.Q
## r2_0.01.24.Q
## r2_0.01.25.Q
## r2_0.01.3.Q
## r2_0.01.4.Q
## r2_0.01.5.Q
## r2_0.01.6.Q
## r2_0.01.7.Q
## r2_0.01.8.Q
## r2_0.01.9.Q
Make plot
# Ancestry coefficients (Q matrix) for K = 5, run 1, r2_0.01 SNP set.
# One row per individual, one space-separated column per ancestral component.
k5run1 <- here("output", "populations", "admixture", "r2_0.01", "run1", "r2_0.01.5.Q") |>
  read_delim(
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.317 0.0707 0.00016 0.0522 0.560
## 2 0.279 0.0959 0.0108 0.0445 0.570
## 3 0.249 0.0636 0.0118 0.0126 0.663
## 4 0.243 0.0109 0.00547 0.0332 0.707
## 5 0.232 0.00151 0.00001 0.0344 0.732
## 6 0.250 0.0409 0.0151 0.0381 0.656
The fam file
# Path to the PLINK .fam file for the r2_0.01 SNP set
fam_file <- here("output", "populations", "snps_sets", "r2_0.01.fam")
# PLINK .fam files carry these six columns, in this order
fam_cols <- c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype")
# Read the .fam file
fam_data <- read.table(fam_file, header = FALSE, col.names = fam_cols)
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Rename the PLINK identifier columns to the short names used downstream:
# IndividualID -> ind, FamilyID -> pop.
names(fam_data)[names(fam_data) == "IndividualID"] <- "ind"
names(fam_data)[names(fam_data) == "FamilyID"] <- "pop"
# Keep only the individual and population identifiers
fam_data <- fam_data |>
  dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.317249 0.070658 0.000160 0.052193 0.559741
## 2 1002 OKI 0.279230 0.095881 0.010824 0.044486 0.569579
## 3 1003 OKI 0.248873 0.063590 0.011806 0.012550 0.663181
## 4 1004 OKI 0.243349 0.010864 0.005469 0.033171 0.707148
## 5 1005 OKI 0.232079 0.001506 0.000010 0.034445 0.731960
## 6 1006 OKI 0.249958 0.040899 0.015095 0.038120 0.655927
Rename the columns
# Rename the ancestry columns (read in as X1..X5) to v1..v5, matching the
# color palette keys defined further below.
# NOTE(review): this assumes `ind` and `pop` were already attached to
# k5run1 in a previous (not shown) chunk -- see the "Add it to matrix" output.
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.317249 0.070658 0.000160 0.052193 0.559741
## 2 1002 OKI 0.279230 0.095881 0.010824 0.044486 0.569579
## 3 1003 OKI 0.248873 0.063590 0.011806 0.012550 0.663181
## 4 1004 OKI 0.243349 0.010864 0.005469 0.033171 0.707148
## 5 1005 OKI 0.232079 0.001506 0.000010 0.034445 0.731960
## 6 1006 OKI 0.249958 0.040899 0.015095 0.038120 0.655927
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
# Load the project's plotting helpers (presumably defines my_theme() used
# below -- confirm against scripts/analysis/my_theme3.R).
source(
here(
"scripts", "analysis", "my_theme3.R"
)
)
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
# (countries missing from the vector above would get NA here)
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Reshape the Q matrix to long format and attach sampling metadata.
# Melt the data frame for plotting
Q_melted <- k5run1 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
# NOTE(review): Q_ordered stays grouped by Region_Country after this step
# (no ungroup()) -- downstream code appears to rely only on row order.
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Custom axis labeller: show the country name only at the first individual
# of each country (positions listed in label_df), empty strings elsewhere.
label_func <- function(x) {
  out <- character(length(x))
  out[x %in% label_df$ind] <- label_df$label
  out
}
# Calculate the position of lines
# NOTE(review): border_positions is recomputed below with a different
# offset, and neither version is referenced by the final plot -- this
# first computation appears to be dead code.
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
# (one row per population: first individual, with "POP - City" label text)
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
# NOTE(review): pop_labels_bars is recomputed identically to the version
# above -- harmless duplication.
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Drop minor ancestry components (value <= min_value) and renormalize the
# remainder so each individual's retained proportions sum to 1.
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value)) |>
    ungroup() # don't leak the row grouping to downstream code
}
# Use the function: drop components below 10% ancestry
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
# Fixed colors for the five ancestry components (keys match column names v1..v5)
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#FFFF00",
"v3" = "#0000FF",
"v4" = "#FF00FF",
"v5" = "#FF0000"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name
# NOTE(review): the `color` column holds the palette KEYS ("v1".."v5",
# identical to `variable`), not hex values; scale_fill_manual() below maps
# those keys to the actual colors.
color_mapping <- data.frame(variable = all_variables,
color = names(color_palette))
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot: one stacked bar of ancestry proportions per individual,
# with vertical lines between populations and rotated population labels.
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:25 with 20,931 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Download the data
rsync -chavzP --stats --include='*/' --include='*.Q' --exclude='*' lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/admixture/r2_0.1/* /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/admixture/r2_0.1
Check the downloaded data
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2777077 148.4 4261935 227.7 NA 4261935 227.7
## Vcells 4848626 37.0 10146329 77.5 32768 7109557 54.3
Make plot
# Extract ancestry coefficients (Q matrix) for k = 5, run 1
q_file <- here("output", "populations", "admixture", "r2_0.1", "run1", "r2_0.1.5.Q")
k5run1 <- read_delim(
  q_file,
  delim = " ",            # .Q files are space-delimited
  col_names = FALSE,      # no header row in .Q files
  show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.248 0.129 0.00001 0.00912 0.614
## 2 0.220 0.145 0.00001 0.0184 0.616
## 3 0.181 0.0975 0.00001 0.00001 0.722
## 4 0.147 0.0112 0.00001 0.00516 0.837
## 5 0.105 0.00001 0.00001 0.00001 0.895
## 6 0.171 0.0981 0.00001 0.00001 0.731
The fam file
fam_file <- here("output", "populations", "snps_sets", "r2_0.1.fam")
# PLINK .fam files have six fixed columns and no header row
fam_cols <- c(
  "FamilyID", "IndividualID", "PaternalID",
  "MaternalID", "Sex", "Phenotype"
)
# Read the .fam file
fam_data <- read.table(fam_file, header = FALSE, col.names = fam_cols)
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Rename FamilyID -> pop and IndividualID -> ind, then keep only those two
# columns. dplyr::rename errors if a column is missing, unlike the silent
# colnames()<- replacement it replaces, so schema drift is caught early.
fam_data <- fam_data |>
  dplyr::rename(ind = IndividualID, pop = FamilyID) |>
  dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.247788 0.128696 1e-05 0.009116 0.614391
## 2 1002 OKI 0.220143 0.145400 1e-05 0.018377 0.616069
## 3 1003 OKI 0.180728 0.097481 1e-05 0.000010 0.721772
## 4 1004 OKI 0.146729 0.011211 1e-05 0.005160 0.836890
## 5 1005 OKI 0.105331 0.000010 1e-05 0.000010 0.894639
## 6 1006 OKI 0.171413 0.098054 1e-05 0.000010 0.730513
Rename the columns
# Rename the ancestry columns (everything except ind/pop) to v1, v2, ...
k5run1 <- k5run1 |>
  rename_with(.fn = \(cols) paste0("v", seq_along(cols)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.247788 0.128696 1e-05 0.009116 0.614391
## 2 1002 OKI 0.220143 0.145400 1e-05 0.018377 0.616069
## 3 1003 OKI 0.180728 0.097481 1e-05 0.000010 0.721772
## 4 1004 OKI 0.146729 0.011211 1e-05 0.005160 0.836890
## 5 1005 OKI 0.105331 0.000010 1e-05 0.000010 0.894639
## 6 1006 OKI 0.171413 0.098054 1e-05 0.000010 0.730513
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
# Load the shared plotting theme (defines my_theme())
source(
  here(
    "scripts", "analysis", "my_theme3.R"
  )
)
# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)
# Add the region to the data frame.
# NOTE(review): countries absent from the lookup become NA in Region2 —
# confirm every Country value in sampling_loc is covered above.
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Melt the Q matrix to long format: one row per individual x ancestry component
Q_melted <- k5run1 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual.
# The factor levels fixed here determine the bar order on the plot's x-axis.
Q_ordered <- Q_joined |>
  arrange(Region, Region2, Country, ind) |>
  mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions.
# NOTE(review): with one row per ind/variable pair this is a de-duplication;
# confirm no replicate rows are introduced upstream by the join.
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders: one row per Region_Country group
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Position of the last individual of each country; +0.5 shifts the border to
# the right edge of that bar. vapply() pins the return type to numeric(1)
# per element (sapply can silently change output shape on odd input).
borders$order <-
  vapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc)), numeric(1)) + 0.5
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)
# Custom axis labeller: blank for every tick except the first individual of
# each country, which shows that country's name.
# NOTE(review): assumes `x` arrives in the same order as label_df — holds for
# scale_x_discrete given the factor levels fixed when building Q_ordered.
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}
# Position of country separator lines: last individual of each country,
# shifted left by one. Computed once — the original computed this twice,
# with the second assignment silently overwriting the first.
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)
# Population labels: one row per population, placed at its first individual
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind)) # position of the population label
# Vertical bar positions: half a unit left of each population's first bar
# (also originally computed twice; once is enough)
pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind) - .5)
# Drop ancestry components at or below `min_value` and re-normalize each
# individual's remaining components so they sum to 1.
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value)) |>
    ungroup() # don't leak the per-individual grouping to callers
}
# Use the function: drop minor components (<= 0.1) and re-normalize
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
# Fixed palette keyed by ancestry-component name
color_palette <-
  c(
    "v1" = "#00FF00", # green
    "v2" = "#FFFF00", # yellow
    "v3" = "#0000FF", # blue
    "v4" = "#FF00FF", # magenta
    "v5" = "#FF0000"  # red
  )
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name.
# NOTE(review): `color` holds the palette *names* ("v1".."v5"), not hex codes;
# `fill = color` therefore produces levels that
# scale_fill_manual(values = color_palette) maps to the hex colors.
# `fill = variable` would be equivalent without this mapping/merge.
color_mapping <- data.frame(variable = all_variables,
                            color = names(color_palette))
# Merge with Q_grouped_filtered.
# NOTE(review): merge() re-sorts rows by `variable`; the stacked bars are
# ordered by fill factor levels, so row order should not matter — confirm.
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Stacked ancestry bar plot: one bar per individual, k = 5 components,
# country separators and population labels drawn on top.
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_col(width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = 0.2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    angle = 90,
    vjust = 1.5,
    hjust = 0,
    size = 2,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 12),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:25 with 57,780 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))
# Load the shared plotting theme (defines my_theme())
source(
  here(
    "scripts", "analysis", "my_theme3.R"
  )
)
# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)
# Add the region to the data frame.
# NOTE(review): countries absent from the lookup become NA in Region2 —
# confirm every Country value in sampling_loc is covered above.
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Melt the Q matrix to long format: one row per individual x ancestry component
Q_melted <- k5run1 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual.
# The factor levels fixed here determine the bar order on the plot's x-axis.
Q_ordered <- Q_joined |>
  arrange(Region, Region2, Country, ind) |>
  mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions.
# NOTE(review): with one row per ind/variable pair this is a de-duplication;
# confirm no replicate rows are introduced upstream by the join.
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders: one row per Region_Country group
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Position of the last individual of each country; +0.5 shifts the border to
# the right edge of that bar. vapply() pins the return type to numeric(1)
# per element (sapply can silently change output shape on odd input).
borders$order <-
  vapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc)), numeric(1)) + 0.5
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)
# Custom axis labeller: blank for every tick except the first individual of
# each country, which shows that country's name.
# NOTE(review): assumes `x` arrives in the same order as label_df — holds for
# scale_x_discrete given the factor levels fixed when building Q_ordered.
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}
# Position of country separator lines: last individual of each country,
# shifted left by one. Computed once — the original computed this twice,
# with the second assignment silently overwriting the first.
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)
# Population labels: one row per population, placed at its first individual
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind)) # position of the population label
# Vertical bar positions: half a unit left of each population's first bar
# (also originally computed twice; once is enough)
pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind) - .5)
# Drop ancestry components at or below `min_value` and re-normalize each
# individual's remaining components so they sum to 1.
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value)) |>
    ungroup() # don't leak the per-individual grouping to callers
}
# Use the function: drop minor components (<= 0.1) and re-normalize
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
# Fixed palette keyed by ancestry-component name.
# (Original comments mislabeled every color; corrected below.)
color_palette <-
  c(
    "v1" = "#00FF00", # green
    "v2" = "#FFFF00", # yellow
    "v3" = "#0000FF", # blue
    "v4" = "#FF00FF", # magenta
    "v5" = "#FF0000"  # red
  )
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name.
# NOTE(review): `color` holds the palette *names* ("v1".."v5"), not hex codes;
# `fill = color` therefore produces levels that
# scale_fill_manual(values = color_palette) maps to the hex colors.
# `fill = variable` would be equivalent without this mapping/merge.
color_mapping <- data.frame(variable = all_variables,
                            color = names(color_palette))
# Merge with Q_grouped_filtered.
# NOTE(review): merge() re-sorts rows by `variable`; the stacked bars are
# ordered by fill factor levels, so row order should not matter — confirm.
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Stacked ancestry bar plot: one bar per individual, k = 5 components,
# country separators and population labels drawn on top.
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_col(width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = 0.2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    angle = 90,
    vjust = 1.5,
    hjust = 0,
    size = 2,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 12),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:25 with 57,780 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))