library(tidyverse)
library(here)
library(dplyr)
library(ggplot2)
library(colorout)
library(extrafont)
library(scales)
library(stringr)
library(flextable)
library(ggtext)
library(RColorBrewer)
library(ggrepel)
library(grid)
library(forcats)
library(officer)
library(ggstatsplot)
I will use the PopLDdecay for the estimates
https://github.com/BGI-shenzhen/PopLDdecay and the manuscript DOI:10.1093/bioinformatics/bty875
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2419409 129.3 4551360 243.1 NA 4551360 243.1
## Vcells 4078901 31.2 8388608 64.0 32768 6637402 50.7
Mosquitoes removed due excess heterozygosity
## KAT 9
## KAT 12
Mosquitoes removed duo to relatedness
## #FID IID
## OKI 1010
## KAN 1327
Create a list of individuals to remove
Create file
plink2 \
--allow-extra-chr \
--remove output/populations/linkage/remove.txt \
--bfile output/populations/file3 \
--make-bed \
--out output/populations/linkage/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/populations/linkage/ld1.log
## 239 samples (30 females, 69 males, 140 ambiguous; 239 founders) loaded from
## 104139 variants loaded from output/populations/file3.bim.
## --remove: 235 samples remaining.
## 235 samples (30 females, 65 males, 140 ambiguous; 235 founders) remaining after
## BEN 12
## CAM 12
## CHA 12
## GEL 2
## HAI 12
## HAN 4
## HOC 7
## HUN 12
## INJ 11
## INW 4
## JAF 2
## KAC 6
## KAG 12
## KAN 11
## KAT 8
## KLP 4
## KUN 4
## LAM 9
## MAT 12
## OKI 11
## QNC 12
## SON 3
## SSK 12
## SUF 6
## SUU 6
## TAI 8
## UTS 12
## YUN 9
Create list of populations
awk '{print $1}' output/populations/linkage/ld1.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 6 {print}' | awk '{print $1}' > output/populations/linkage/pops_4ld.txt;
head -n100 output/populations/linkage/pops_4ld.txt
## BEN
## CAM
## CHA
## HAI
## HOC
## HUN
## INJ
## KAC
## KAG
## KAN
## KAT
## LAM
## MAT
## OKI
## QNC
## SSK
## SUF
## SUU
## TAI
## UTS
## YUN
Import the .bim file with the SNPs to create a new chromosomal scale.
# ____________________________________________________________________________
# import the bim file with the SNP data ####
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here(
"output", "populations", "linkage", "ld1.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
## Scaffold SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1.1 AX-583033226 0 161729 A G
## 2 1.1 AX-583033342 0 315059 C G
## 3 1.1 AX-583033370 0 330057 G A
## 4 1.1 AX-583035194 0 330265 A G
## 5 1.1 AX-583033387 0 331288 C T
## 6 1.1 AX-583035211 0 345197 C T
Separate the tibbles into each chromosome
# ____________________________________________________________________________
# separate the SNP data per chromosome ####
# chr1
chr1_snps <-
snps |>
filter(
str_detect(
Scaffold, "^1."
)
) |> # here we get only Scaffold rows starting with 1
as_tibble() # save as tibble
#
# chr2
chr2_snps <-
snps |>
filter(
str_detect(
Scaffold, "^2."
)
) |>
as_tibble()
#
# chr3
chr3_snps <-
snps |>
filter(
str_detect(
Scaffold, "^3."
)
) |>
as_tibble()
Import the file with sizes of each scaffold.
# ____________________________________________________________________________
# import the file with the scaffold sizes ####
sizes <-
read_delim(
here(
"data", "genome", "scaffold_sizes.txt"
),
col_names = FALSE,
show_col_types = FALSE,
col_types = "cd"
)
#
# set column names
colnames(
sizes
) <- c(
"Scaffold", "Size"
)
# ____________________________________________________________________________
# create new column with the chromosome number ####
sizes <-
sizes |>
mutate(
Chromosome = case_when( # we use mutate to create a new column called Chromosome
startsWith(
Scaffold, "1"
) ~ "1", # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
startsWith(
Scaffold, "2"
) ~ "2",
startsWith(
Scaffold, "3"
) ~ "3"
)
) |>
arrange(
Scaffold
) # to sort the order of the scaffolds, fixing the problem we have with scaffold 1.86
# check it
head(sizes)
## # A tibble: 6 × 3
## Scaffold Size Chromosome
## <chr> <dbl> <chr>
## 1 1.1 351198 1
## 2 1.10 11939576 1
## 3 1.100 3389100 1
## 4 1.101 470438 1
## 5 1.102 2525157 1
## 6 1.103 150026 1
Create new scale. Get the scaffolds for each chromosome.
# ____________________________________________________________________________
# separate the scaffold sizes tibble per chromosome ####
# chr1
chr1_scaffolds <-
sizes |>
filter(
str_detect(
Scaffold, "^1" # we use library stringr to get scaffolds starting with 1 (chromosome 1)
)
) |>
as_tibble()
#
# chr2
chr2_scaffolds <-
sizes |>
filter(
str_detect(
Scaffold, "^2" # we use library stringr to get scaffolds starting with 2 (chromosome 2)
)
) |>
as_tibble()
#
# # chr3
chr3_scaffolds <-
sizes |>
filter(
str_detect(
Scaffold, "^3" # we use library stringr to get scaffolds starting with 3 (chromosome 3)
)
) |>
as_tibble()
Create a scale for each chromosome.
# ____________________________________________________________________________
# create a new scale for each chromosome ####
# chr1
chr1_scaffolds$overall_size_before_bp <-
0 # we create a new column with zeros
for (i in 2:nrow(
chr1_scaffolds
)
) { # loop to start on second line
chr1_scaffolds$overall_size_before_bp[i] <- # set position on the scale
chr1_scaffolds$overall_size_before_bp[i - 1] + chr1_scaffolds$Size[i - # add the scaffold size and the location to get position on new scale
1]
}
#
# chr2
chr2_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
chr2_scaffolds
)
) {
chr2_scaffolds$overall_size_before_bp[i] <-
chr2_scaffolds$overall_size_before_bp[i - 1] + chr2_scaffolds$Size[i -
1]
}
#
# chr3
chr3_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
chr3_scaffolds
)
) {
chr3_scaffolds$overall_size_before_bp[i] <-
chr3_scaffolds$overall_size_before_bp[i - 1] + chr3_scaffolds$Size[i -
1]
}
Merge the data frames scaffolds and SNPs.
# ____________________________________________________________________________
# merge the data sets using the tidyverse function left_join ####
# chr1
chr1_scale <-
chr1_snps |> # create data frame for each chromosome, get chr1_snps
left_join( # use lef_join function to merge it with chr1_scaffolds
chr1_scaffolds,
by = "Scaffold"
) |> # set column to use for merging (Scaffold in this case)
na.omit() |> # remove NAs, we don't have SNPs in every scaffold
mutate(
midPos_fullseq = as.numeric(
Position
) + # make new columns numeric
as.numeric(
overall_size_before_bp
)
)
#
# chr2
chr2_scale <-
chr2_snps |>
left_join(
chr2_scaffolds,
by = "Scaffold"
) |>
na.omit() |>
mutate(
midPos_fullseq = as.numeric(
Position
) +
as.numeric(
overall_size_before_bp
)
)
#
# chr3
chr3_scale <-
chr3_snps |>
left_join(
chr3_scaffolds,
by = "Scaffold"
) |>
na.omit() |>
mutate(
midPos_fullseq = as.numeric(
Position
) +
as.numeric(
overall_size_before_bp
)
)
Merge all chromosome scales.
# ____________________________________________________________________________
# merge the data sets, and select only the columns we need ####
chroms <- rbind(
chr1_scale, chr2_scale, chr3_scale
) |>
dplyr::select(
Chromosome, SNP, Cm, midPos_fullseq, Allele1, Allele2
)
# check it
head(chroms)
## # A tibble: 6 × 6
## Chromosome SNP Cm midPos_fullseq Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583033226 0 161729 A G
## 2 1 AX-583033342 0 315059 C G
## 3 1 AX-583033370 0 330057 G A
## 4 1 AX-583035194 0 330265 A G
## 5 1 AX-583033387 0 331288 C T
## 6 1 AX-583035211 0 345197 C T
Save the new .bim file
# ____________________________________________________________________________
# save the new bim file with a new name, I added "B" ####
write.table(
chroms,
file = here(
"output", "populations", "linkage", "ld1B.bim"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Rename the .bim files
# change the name of the first .bim file, for example, append _backup.bim, and then replace the original file
mv output/populations/linkage/ld1.bim output/populations/linkage/ld1_backup.bim;
# than change the new bim we create to the original name (do it only once, otherwise it will mess up)
mv output/populations/linkage/ld1B.bim output/populations/linkage/ld1.bim
Create a new bed file with Plink2 to see if it works. For example, to see if the variants are in the right order. Plink2 will give us a warning.
plink2 \
--bfile output/populations/linkage/ld1 \
--make-bed \
--out output/populations/linkage/test01;
# then we remove the files
rm output/populations/linkage/test01.*
## PLINK v2.00a3.3 64-bit (3 Jun 2022) www.cog-genomics.org/plink/2.0/
## (C) 2005-2022 Shaun Purcell, Christopher Chang GNU General Public License v3
## Logging to output/populations/linkage/test01.log.
## Options in effect:
## --bfile output/populations/linkage/ld1
## --make-bed
## --out output/populations/linkage/test01
##
## Start time: Sun Oct 8 14:53:48 2023
## 32768 MiB RAM detected; reserving 16384 MiB for main workspace.
## Using up to 12 threads (change this with --threads).
## 235 samples (30 females, 65 males, 140 ambiguous; 235 founders) loaded from
## output/populations/linkage/ld1.fam.
## 104139 variants loaded from output/populations/linkage/ld1.bim.
## Note: No phenotype data present.
## Writing output/populations/linkage/test01.fam ... done.
## Writing output/populations/linkage/test01.bim ... done.
## Writing output/populations/linkage/test01.bed ... 0%62%done.
## End time: Sun Oct 8 14:53:48 2023
No warnings from Plink2. Now, we can go ahead with our analysis.
We will use MAF 0.005 for each chromosome (default from PopLDdecay) Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2556968 136.6 4548762 243.0 NA 4548762 243.0
## Vcells 4432572 33.9 10825744 82.6 32768 10752723 82.1
We can estimate LD for each chromosome separated.
Import the .bim file with the SNPs to create a new chromosomal scale.
# ____________________________________________________________________________
# import the bim file with the SNP data ####
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here(
"output", "populations", "linkage", "ld1.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Chromosome", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
## Chromosome SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583033226 0 161729 A G
## 2 1 AX-583033342 0 315059 C G
## 3 1 AX-583033370 0 330057 G A
## 4 1 AX-583035194 0 330265 A G
## 5 1 AX-583033387 0 331288 C T
## 6 1 AX-583035211 0 345197 C T
Separate the tibbles into each chromosome.
# ____________________________________________________________________________
# separate the SNP data per chromosome ####
# chr1
chr1_snps <-
snps |>
filter(Chromosome == 1) |> # here we get only Chromosome rows starting with 1
as_tibble() # save as tibble
#
# chr2
chr2_snps <-
snps |>
filter(Chromosome == 2) |>
as_tibble()
#
# chr3
chr3_snps <-
snps |>
filter(Chromosome == 3) |>
as_tibble()
We have objects with the SNPs for each chromosome
## # A tibble: 6 × 6
## Chromosome SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583033226 0 161729 A G
## 2 1 AX-583033342 0 315059 C G
## 3 1 AX-583033370 0 330057 G A
## 4 1 AX-583035194 0 330265 A G
## 5 1 AX-583033387 0 331288 C T
## 6 1 AX-583035211 0 345197 C T
Filter the data by chromosome
Chromosome 1
write.table(
chr1_snps$SNP,
file = here(
"output", "populations", "linkage", "chr1_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Chromosome 2
write.table(
chr2_snps$SNP,
file = here(
"output", "populations", "linkage", "chr2_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Chromosome 3
write.table(
chr3_snps$SNP,
file = here(
"output", "populations", "linkage", "chr3_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Now create new files for each chromosome
Create dirs
# make directory
mkdir -p output/populations/linkage/chr1;
mkdir -p output/populations/linkage/chr2;
mkdir -p output/populations/linkage/chr3;
Chromosome 1
plink2 \
--allow-extra-chr \
--bfile output/populations/linkage/ld1 \
--keep-fam output/populations/linkage/pops_4ld.txt \
--extract output/populations/linkage/chr1_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/populations/linkage/chr1/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/populations/linkage/chr1/ld1.log
## 235 samples (30 females, 65 males, 140 ambiguous; 235 founders) loaded from
## 104139 variants loaded from output/populations/linkage/ld1.bim.
## --extract: 23397 variants remaining.
## --keep-fam: 212 samples remaining.
## 212 samples (28 females, 61 males, 123 ambiguous; 212 founders) remaining after
## 1044 variants removed due to allele frequency threshold(s)
## 22353 variants remaining after main filters.
Chromosome 2
plink2 \
--allow-extra-chr \
--bfile output/populations/linkage/ld1 \
--keep-fam output/populations/linkage/pops_4ld.txt \
--extract output/populations/linkage/chr2_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/populations/linkage/chr2/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/populations/linkage/chr2/ld1.log
## 235 samples (30 females, 65 males, 140 ambiguous; 235 founders) loaded from
## 104139 variants loaded from output/populations/linkage/ld1.bim.
## --extract: 43518 variants remaining.
## --keep-fam: 212 samples remaining.
## 212 samples (28 females, 61 males, 123 ambiguous; 212 founders) remaining after
## 1786 variants removed due to allele frequency threshold(s)
## 41732 variants remaining after main filters.
Chromosome 3
plink2 \
--allow-extra-chr \
--bfile output/populations/linkage/ld1 \
--keep-fam output/populations/linkage/pops_4ld.txt \
--extract output/populations/linkage/chr3_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/populations/linkage/chr3/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/populations/linkage/chr3/ld1.log
## 235 samples (30 females, 65 males, 140 ambiguous; 235 founders) loaded from
## 104139 variants loaded from output/populations/linkage/ld1.bim.
## --extract: 37224 variants remaining.
## --keep-fam: 212 samples remaining.
## 212 samples (28 females, 61 males, 123 ambiguous; 212 founders) remaining after
## 1557 variants removed due to allele frequency threshold(s)
## 35667 variants remaining after main filters.
Check the vcfs
## output/populations/linkage/chr1/ld1.vcf
## output/populations/linkage/chr2/ld1.vcf
## output/populations/linkage/chr3/ld1.vcf
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2557457 136.6 4548762 243.0 NA 4548762 243.0
## Vcells 4434440 33.9 10825744 82.6 32768 10752723 82.1
Prepare the files required by PopLDdecay
bcftools query -l output/populations/linkage/chr1/ld1.vcf > output/populations/linkage/vcf_samples.txt
bcftools query -l output/populations/linkage/chr1/ld1.vcf | cut -d'_' -f1 | sort | uniq > output/populations/linkage/unique_pops_from_vcf.txt
while read pop; do
grep "^${pop}_" output/populations/linkage/vcf_samples.txt > output/populations/linkage/${pop}_samples.txt
done < output/populations/linkage/unique_pops_from_vcf.txt
while read pop; do
PopLDdecay -InVCF output/populations/linkage/chr1/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/populations/linkage/chr1/${pop}_LDdecay.gz \
-SubPop output/populations/linkage/${pop}_samples.txt
done < output/populations/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/populations/linkage/chr1/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/populations/linkage/chr1/ld_decay_results_list.txt
head -100 output/populations/linkage/chr1/ld_decay_results_list.txt
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
# Define the path to the .fam file using here
fam_file_path <- here("output", "populations", "linkage", "chr1", "ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("output/populations/linkage/chr1/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
p
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3000593 160.3 5804207 310 NA 5804207 310
## Vcells 7553105 57.7 23063803 176 32768 23063803 176
while read pop; do
PopLDdecay -InVCF output/populations/linkage/chr2/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/populations/linkage/chr2/${pop}_LDdecay.gz \
-SubPop output/populations/linkage/${pop}_samples.txt
done < output/populations/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/populations/linkage/chr2/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/populations/linkage/chr2/ld_decay_results_list.txt
head -100 output/populations/linkage/chr2/ld_decay_results_list.txt
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
# Define the path to the .fam file using here
fam_file_path <- here("output", "populations", "linkage", "chr2", "ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("output/populations/linkage/chr2/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
p
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3001341 160.3 5804207 310.0 NA 5804207 310.0
## Vcells 7753626 59.2 27849300 212.5 32768 24156306 184.3
while read pop; do
PopLDdecay -InVCF output/populations/linkage/chr3/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/populations/linkage/chr3/${pop}_LDdecay.gz \
-SubPop output/populations/linkage/${pop}_samples.txt
done < output/populations/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/populations/linkage/chr3/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/populations/linkage/chr3/ld_decay_results_list.txt
head -100 output/populations/linkage/chr3/ld_decay_results_list.txt
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
# Define the path to the .fam file using here
fam_file_path <- here("output", "populations", "linkage", "chr3", "ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("output/populations/linkage/chr3/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
p
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3002096 160.4 5804207 310.0 NA 5804207 310.0
## Vcells 7818001 59.7 27920698 213.1 32768 27920698 213.1
Create data frame with the data from each chromosome
Import the data
# Chromosome 1
chr1 <- readRDS(here(
"output", "populations", "linkage", "chr1", "chr1.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr1 = half_distance
) |>
mutate(chr1 = chr1/1000)
# Chromosome 2
chr2 <- readRDS(here(
"output", "populations", "linkage", "chr2", "chr2.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr2 = half_distance
) |>
mutate(chr2 = chr2/1000)
# Chromosome 3
chr3 <- readRDS(here(
"output", "populations", "linkage", "chr3", "chr3.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr3 = half_distance
) |>
mutate(chr3 = chr3/1000)
# Merge
distance <- merge(merge(chr1, chr2, by="population"), chr3, by="population")
# Convert data from wide to long format
data_long <- gather(distance, key = "chr", value = "value", -population)
# Make it capital
data_long$chr <- tools::toTitleCase(data_long$chr)
# Remove KAT because it has mosquitoes collected from different source and had higher linkage due it
data_long <- subset(data_long, population != "KAT")
Make one plot
# Define a custom color palette
custom_palette <- c(
"Chr1" = "#edb975",
"Chr2" = "#c1f5ff",
"Chr3" = "#ff72f3"
)
# Reordering the levels of the chr factor
data_long$chr <- factor(data_long$chr, levels = c("Chr3", "Chr2", "Chr1"))
# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))
# Plotting half_distance with borders and spaced bars
ggplot(data_long, aes(x = population, y = value, fill = chr)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
labs(title = "", y = "Half Distance (kb)", x = "Population") +
geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 4) +
my_theme() +
coord_flip() +
scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
theme(legend.position = "top",
plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"))
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
## population chr value
## 1 BEN Chr1 18.9
## 2 CAM Chr1 18.9
## 3 CHA Chr1 12.3
## 4 HAI Chr1 1.5
## 5 HOC Chr1 54.6
## 6 HUN Chr1 12.7
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
## population chr value Pop_City Country Latitude Longitude Region
## 1 BEN Chr1 18.9 Bengaluru India 12.9716 77.5946 Asia
## 2 CAM Chr1 18.9 Phnom Penh Cambodia 11.5564 104.9282 Asia
## 3 CHA Chr1 12.3 Chanthaburi Thailand 12.6097 102.1045 Asia
## 4 HAI Chr1 1.5 Hainan China 19.2000 109.6000 Asia
## 5 HOC Chr1 54.6 Ho Chi Minh City Vietnam 10.7626 106.6602 Asia
## 6 HUN Chr1 12.7 Hunan China 27.6253 111.8569 Asia
Add the name of the city/countries to the plot
# Creating the label with population, city, and country for the y-axis
distance2$pop_city_country_label <- paste(distance2$population, "\n", distance2$Pop_City, "\n(", distance2$Country, ")", sep = "")
# Sorting by Country, then City, and then by Population to ensure populations from the same country (and city) are plotted together
distance2 <- distance2 %>% arrange(Country, Pop_City, population)
# Adjusting the factor levels for plotting in the desired order
distance2$pop_city_country_label <- fct_inorder(distance2$pop_city_country_label)
# Plotting the data
ggplot(distance2, aes(x = pop_city_country_label, y = value, fill = chr)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
labs(title = "", y = "Half Distance (kb)", x = "") +
geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 3.5) +
my_theme() +
coord_flip() +
labs(title = "", y = "Half Distance (kb)", x = "Population") +
scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
theme(legend.position = "top",
plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"),
axis.text.y = element_text(angle = 0, hjust = 1))
# Create table
dist_table <- distance2 |>
dplyr::select(
population, Pop_City, Country, chr, value
) |>
dplyr::rename(
City = Pop_City,
Population = population
)
# Spread and arrange
dist_table_condensed <- dist_table %>%
spread(key = chr, value = value) %>%
dplyr::select(Population, Country, City, Chr1, Chr2, Chr3) |>
dplyr::arrange(Country, City)
# Create table
ft <- flextable::flextable(dist_table_condensed)
# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)
# Show it
ft
Population | Country | City | Chr1 | Chr2 | Chr3 |
---|---|---|---|---|---|
CAM | Cambodia | Phnom Penh | 18.9 | 115.2 | 139.4 |
HAI | China | Hainan | 1.5 | 63.8 | 66.0 |
HUN | China | Hunan | 12.7 | 57.9 | 57.0 |
YUN | China | Yunnan | 16.1 | 335.6 | 1.0 |
BEN | India | Bengaluru | 18.9 | 194.6 | 207.6 |
INJ | Indonesia | Jakarta | 37.3 | 91.1 | 58.8 |
SUF | Indonesia | Sulawesi (Forest) | 39.9 | 333.7 | 163.1 |
SUU | Indonesia | Sulawesi (Urban) | 63.8 | 353.5 | 278.7 |
KAG | Japan | Kagoshima | 45.7 | 30.3 | 3.9 |
KAN | Japan | Kanazawa | 42.0 | 46.4 | 57.7 |
OKI | Japan | Okinawa | 6.1 | 21.4 | 63.0 |
UTS | Japan | Utsunomiya | 33.9 | 14.6 | 25.0 |
MAT | Malaysia | Tambun | 25.5 | 157.4 | 168.3 |
TAI | Taiwan | Tainan | 9.9 | 261.2 | 121.4 |
CHA | Thailand | Chanthaburi | 12.3 | 174.9 | 96.6 |
KAC | Thailand | Kanchanaburi | 22.3 | 378.1 | 46.7 |
LAM | Thailand | Lampang | 18.9 | 281.7 | 88.8 |
SSK | Thailand | Sisaket | 3.1 | 170.6 | 147.8 |
HOC | Vietnam | Ho Chi Minh City | 54.6 | 182.5 | 146.0 |
QNC | Vietnam | Quy Nhon City | 3.5 | 118.2 | 6.6 |
# Create a new Word document
doc <- read_docx()
# Add the flextable
doc <- body_add_flextable(doc, value = ft)
# Save the document to a file
# Define the path for saving the Word document
file_path <- here("output", "populations", "linkage", "decay_albopictus.docx")
print(doc, target = file_path)
Get counts
# Define the path to the .fam file using here
fam_file_path <- here("output", "populations", "linkage", "chr3", "ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
##
## BEN CAM CHA HAI HOC HUN INJ KAC KAG KAN KAT LAM MAT OKI QNC SSK SUF SUU TAI UTS
## 12 12 12 12 7 12 11 6 12 11 8 9 12 11 12 12 6 6 8 12
## YUN
## 9
Merge the data
# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")
# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")
Check correlation
# Chr1 vs Count
plot_Chr1 <- ggplot(merged_data, aes(x = Count, y = Chr1)) +
geom_point() +
geom_smooth(method = "lm", col = "red") +
ggtitle("Correlation between Count and Chr1") +
labs(x = "Count", y = "LD Half Distance - Chr1")
# Chr2 vs Count
plot_Chr2 <- ggplot(merged_data, aes(x = Count, y = Chr2)) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
ggtitle("Correlation between Count and Chr2") +
labs(x = "Count", y = "LD Half Distance - Chr2")
# Chr3 vs Count
plot_Chr3 <- ggplot(merged_data, aes(x = Count, y = Chr3)) +
geom_point() +
geom_smooth(method = "lm", col = "green") +
ggtitle("Correlation between Count and Chr3") +
labs(x = "Count", y = "LD Half Distance - Chr3")
# Display the plots
print(plot_Chr1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Estimate r2
# For Chr1
model_Chr1 <- lm(Chr1 ~ Count, data = merged_data)
summary_Chr1 <- summary(model_Chr1)
R2_Chr1 <- summary_Chr1$r.squared
p_value_Chr1 <- coef(summary_Chr1)[2,4]
# For Chr2
model_Chr2 <- lm(Chr2 ~ Count, data = merged_data)
summary_Chr2 <- summary(model_Chr2)
R2_Chr2 <- summary_Chr2$r.squared
p_value_Chr2 <- coef(summary_Chr2)[2,4]
# For Chr3
model_Chr3 <- lm(Chr3 ~ Count, data = merged_data)
summary_Chr3 <- summary(model_Chr3)
R2_Chr3 <- summary_Chr3$r.squared
p_value_Chr3 <- coef(summary_Chr3)[2,4]
# Display R2 and p-values
cat("For Chr1: R2 =", R2_Chr1, ", p-value =", p_value_Chr1, "\n")
## For Chr1: R2 = 0.2268475 , p-value = 0.03375708
## For Chr2: R2 = 0.6429323 , p-value = 2.125989e-05
## For Chr3: R2 = 0.1115358 , p-value = 0.150124
# For Chr1
model_Chr1 <- lm(Chr1 ~ Count, data = merged_data)
summary_Chr1 <- summary(model_Chr1)
R2_Chr1 <- summary_Chr1$r.squared
p_value_Chr1 <- coef(summary_Chr1)[2,4]
# For Chr2
model_Chr2 <- lm(Chr2 ~ Count, data = merged_data)
summary_Chr2 <- summary(model_Chr2)
R2_Chr2 <- summary_Chr2$r.squared
p_value_Chr2 <- coef(summary_Chr2)[2,4]
# For Chr3
model_Chr3 <- lm(Chr3 ~ Count, data = merged_data)
summary_Chr3 <- summary(model_Chr3)
R2_Chr3 <- summary_Chr3$r.squared
p_value_Chr3 <- coef(summary_Chr3)[2,4]
# Display R2 and p-values
cat("For Chr1: R2 =", R2_Chr1, ", p-value =", p_value_Chr1, "\n")
## For Chr1: R2 = 0.2268475 , p-value = 0.03375708
## For Chr2: R2 = 0.6429323 , p-value = 2.125989e-05
## For Chr3: R2 = 0.1115358 , p-value = 0.150124
Annotate the plots
# Helper function to extract coefficients, R2, and create the label
get_annotation <- function(model) {
coefs <- coef(model)
eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
return(paste(eq, r2, sep = "\n"))
}
# Annotations for each chromosome
annotate_Chr1 <- get_annotation(model_Chr1)
annotate_Chr2 <- get_annotation(model_Chr2)
annotate_Chr3 <- get_annotation(model_Chr3)
# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
ggplot(data, aes(x = Count, y = !!sym(yvar))) +
geom_point() +
geom_smooth(method = "lm", col = "red") +
annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.1, label = label, hjust = 0) +
labs(x = "Count", y = yvar)
}
plot1 <- plot_with_annotation(merged_data, "Chr1", annotate_Chr1)
plot2 <- plot_with_annotation(merged_data, "Chr2", annotate_Chr2)
plot3 <- plot_with_annotation(merged_data, "Chr3", annotate_Chr3)
# Display the plots with annotations
print(plot1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Create a facet plot
# Read the data
merged_data_long <- readRDS(here("output", "populations", "linkage", "merged_data_long.rds"))
# Function to compute the linear model details
get_lm_details <- function(data, yvar) {
model <- lm(data[[yvar]] ~ data$Count)
coefs <- coef(model)
eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
return(tibble(Chromosome = yvar, Equation = eq, R2 = r2))
}
# Compute details for each chromosome and bind rows
annotations <- bind_rows(get_lm_details(merged_data, "Chr1"),
get_lm_details(merged_data, "Chr2"),
get_lm_details(merged_data, "Chr3"))
# Compute maximum y for annotations for each chromosome
annotations <- annotations %>%
left_join(merged_data_long %>% group_by(Chromosome) %>% summarise(MaxY = max(LD_Half_Distance, na.rm = TRUE)), by = "Chromosome")
# Plotting with annotations correctly positioned in the top right corner
ggplot(merged_data_long, aes(x = Count, y = LD_Half_Distance)) +
geom_point() +
geom_smooth(method = "lm", col = "red") +
geom_text(data = annotations, aes(label = paste(Equation, R2, sep = "\n"), y = MaxY),
x = max(merged_data$Count), hjust = 1, vjust = 1) +
facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
labs(x = "Number of Samples", y = "LD Half Distance (kb)") +
my_theme()
## `geom_smooth()` using formula = 'y ~ x'
# Save the plot
ggsave(here("output", "populations", "linkage", "decay_sample_size.pdf"), width = 4, height = 6)
## `geom_smooth()` using formula = 'y ~ x'
Plot all countries
# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
group_by(Country, Chromosome) %>%
summarise(mean_value = mean(LD_Half_Distance, na.rm = TRUE), .groups = "drop")
# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Country, y = LD_Half_Distance)) +
geom_boxplot(alpha = 0.5) +
geom_jitter(aes(color = Country), width = 0.2) +
geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 3) + # Annotate mean value
facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
theme_minimal() +
labs(title = "", y = "LD Half Distance (kb)") +
my_theme() +
coord_flip() +
theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none")
# Save the plot
ggsave(here("output", "populations", "linkage", "decay_mean_by_country.pdf"), width = 6, height = 6)
Legend: The Linkage Disequilibrium (LD) half distance values in kilobases (kb) for the Aedes albopictus mosquito across various Asian countries, grouped by chromosome. Each boxplot displays the interquartile range of the LD Half Distance values, with the horizontal line in the box marking the median. Colored jittered points represent individual data points, with each color corresponding to a specific country. The plot also annotates each country-chromosome combination’s mean LD half distance. Distinct facets separate the values for different chromosomes to offer clear visualization.
Use ggstatplot to test if there is any significant differences Chr1
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr1, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr1,
title = "Chr1",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")
# Save the plot
ggsave(here("output", "populations", "linkage", "decay_chr1_test.pdf"), width = 7, height = 6)
Chr2
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr2, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr2,
title = "Chr2",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")
# Save the plot
ggsave(here("output", "populations", "linkage", "decay_chr2_test.pdf"), width = 7, height = 6)
Chr3
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr3, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr3,
title = "Chr3",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")