library(StAMPP)
library(ggplot2)
library(tidyverse)
library(adegenet)
library(here)
library(flextable)
library(officer)
library(reshape2)
library(dplyr)
library(tidyr)
library(geosphere)
library(dartR)
library(MASS)
library(Cairo)
We can use different data sets to run our Fst estimates.
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 5834115 311.6 9855684 526.4 NA 7331362 391.6
## Vcells 9598989 73.3 14832738 113.2 32768 12293949 93.8
We can estimate Fst using only the neutral set of SNPs for populations with at least 4 individuals.
Create list of populations
awk '{print $1}' output/populations/snps_sets/neutral.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 4 {print}' | awk '{print $1}' > output/fst/pops_4fst.txt;
head output/fst/pops_4fst.txt;
wc -l output/fst/pops_4fst.txt
## BEN
## CAM
## CHA
## HAI
## HAN
## HOC
## HUN
## INJ
## INW
## KAC
## 25 output/fst/pops_4fst.txt
We have 25 populations with 4 or more mosquitoes. We can convert to raw format and subset the bed file.
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/neutral \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/neutral \
--silent;
grep 'samples\|variants\|remaining' output/fst/neutral.log
## 9483 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.96967.
## 9483 variants and 230 people pass filters and QC.
See https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of the Fst estimation.
neutral <-
read.PLINK(
here(
"output", "fst", "neutral.raw"
),
quiet = FALSE,
chunkSize = 1000,
parallel = require("parallel"),
n.cores = 4
)
summary(neutral)
Now let's convert the genlight object to StAMPP format and estimate pairwise Fst values.
Spelling out the argument names also works, for example stamppFst(neutral_2, nboots = 100, percent = 95, nclusters = 10), but you can simplify the call and pass only the numbers, as below.
This chunk will take a couple of minutes to run.
# convert
neutral_2 <- stamppConvert(neutral, type="genlight")
# run stampp. If you want to run it with bootstraps and nclusters, use the HPC; it will run out of memory on a 32 GB laptop
neutral_3 <- stamppFst(neutral_2, 1, 95, 1)
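If you do run it on the HPC with bootstraps, stamppFst returns a list rather than a single matrix. A minimal sketch of extracting the pieces (element names as documented in StAMPP):
# With nboots > 1, stamppFst returns a list holding the Fst matrix,
# the p-values, the bootstrap replicates, and the CI limits
neutral_boot <- stamppFst(neutral_2, nboots = 100, percent = 95, nclusters = 10)
fst_values <- neutral_boot$Fsts # pairwise Fst matrix
fst_pvalues <- neutral_boot$Pvalues # p-values testing Fst > 0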
Save the result so we can reload it later without rerunning stamppFst.
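The save and load chunks are hidden; a minimal sketch with RDS files (the file name is an assumption):
# Save the stamppFst result (assumed file name)
saveRDS(neutral_3, here("output", "fst", "neutral_3.rds"))
# To load it back later
neutral_3 <- readRDS(here("output", "fst", "neutral_3.rds"))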
Now let's look at the object.
## OKI HAI YUN KAT
## Min. :0.06693 Min. :0.02807 Min. :-0.002161 Min. :0.06445
## 1st Qu.:0.08088 1st Qu.:0.03572 1st Qu.: 0.017006 1st Qu.:0.07817
## Median :0.09944 Median :0.05998 Median : 0.055814 Median :0.09368
## Mean :0.11276 Mean :0.06803 Mean : 0.056186 Mean :0.11228
## 3rd Qu.:0.14177 3rd Qu.:0.09611 3rd Qu.: 0.082102 3rd Qu.:0.13943
## Max. :0.21810 Max. :0.15999 Max. : 0.143292 Max. :0.21573
## NA's :1 NA's :2 NA's :3 NA's :4
## TAI HUN HAN KLP
## Min. :0.1233 Min. :0.03590 Min. :0.03056 Min. :0.04714
## 1st Qu.:0.1495 1st Qu.:0.04296 1st Qu.:0.04201 1st Qu.:0.05136
## Median :0.1642 Median :0.05908 Median :0.08134 Median :0.10614
## Mean :0.1780 Mean :0.07034 Mean :0.08233 Mean :0.10225
## 3rd Qu.:0.2007 3rd Qu.:0.09196 3rd Qu.:0.11386 3rd Qu.:0.12868
## Max. :0.3004 Max. :0.16409 Max. :0.19157 Max. :0.21872
## NA's :5 NA's :6 NA's :7 NA's :8
## HOC QNC SSK KAC
## Min. :0.03383 Min. :0.07153 Min. :0.000295 Min. :-0.000224
## 1st Qu.:0.04598 1st Qu.:0.07691 1st Qu.:0.010202 1st Qu.: 0.014289
## Median :0.07343 Median :0.13239 Median :0.069284 Median : 0.075309
## Mean :0.08110 Mean :0.12117 Mean :0.056040 Mean : 0.060492
## 3rd Qu.:0.10792 3rd Qu.:0.14787 3rd Qu.:0.083976 3rd Qu.: 0.085694
## Max. :0.17718 Max. :0.21538 Max. :0.134878 Max. : 0.145295
## NA's :9 NA's :10 NA's :11 NA's :12
## CHA KAN UTS CAM
## Min. :0.003151 Min. :0.09212 Min. :0.07893 Min. :0.008014
## 1st Qu.:0.017775 1st Qu.:0.11198 1st Qu.:0.09267 1st Qu.:0.019739
## Median :0.077002 Median :0.12201 Median :0.10527 Median :0.070429
## Mean :0.063989 Mean :0.14434 Mean :0.12345 Mean :0.064043
## 3rd Qu.:0.087944 3rd Qu.:0.16900 3rd Qu.:0.14159 3rd Qu.:0.081284
## Max. :0.131817 Max. :0.23897 Max. :0.21422 Max. :0.135431
## NA's :13 NA's :14 NA's :15 NA's :16
## KAG BEN LAM SUF
## Min. :0.08122 Min. :0.02333 Min. :0.01692 Min. :0.03162
## 1st Qu.:0.09061 1st Qu.:0.05390 1st Qu.:0.06908 1st Qu.:0.06323
## Median :0.12139 Median :0.08997 Median :0.08394 Median :0.06969
## Mean :0.12750 Mean :0.07831 Mean :0.08446 Mean :0.09810
## 3rd Qu.:0.14741 3rd Qu.:0.10404 3rd Qu.:0.10618 3rd Qu.:0.09388
## Max. :0.20375 Max. :0.11899 Max. :0.14422 Max. :0.23208
## NA's :17 NA's :18 NA's :19 NA's :20
## SUU KUN INW INJ
## Min. :0.06603 Min. :0.1486 Min. :0.06648 Min. :0.08835
## 1st Qu.:0.07088 1st Qu.:0.1799 1st Qu.:0.08016 1st Qu.:0.08835
## Median :0.08605 Median :0.2113 Median :0.09384 Median :0.08835
## Mean :0.11206 Mean :0.2123 Mean :0.09384 Mean :0.08835
## 3rd Qu.:0.12723 3rd Qu.:0.2441 3rd Qu.:0.10751 3rd Qu.:0.08835
## Max. :0.21012 Max. :0.2769 Max. :0.12119 Max. :0.08835
## NA's :21 NA's :22 NA's :23 NA's :24
## MAT
## Min. : NA
## 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :25
If you want, you can save the Fst values as a CSV file.
# Convert to data frame
neutral_df <- data.frame(neutral_3)
# Save it
write.csv(neutral_df, file = here("output", "fst", "neutral_df.csv"))
Check the Fst values
## OKI HAI YUN KAT TAI HUN HAN KLP HOC QNC
## OKI NA NA NA NA NA NA NA NA NA NA
## HAI 0.07268706 NA NA NA NA NA NA NA NA NA
## YUN 0.07937617 0.03397451 NA NA NA NA NA NA NA NA
## KAT 0.14886465 0.10179042 0.07762791 NA NA NA NA NA NA NA
## TAI 0.15614563 0.12689237 0.14329227 0.2157330 NA NA NA NA NA NA
## HUN 0.06693096 0.02807325 0.04026851 0.1034478 0.1233325 NA NA NA NA NA
## SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## HAI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## YUN NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## KAT NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## TAI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## HUN NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
We will convert the data into a matrix.
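The conversion chunk is hidden; a sketch, assuming the full matrix is stored in aa (the name used further down for the distance comparison):
# Mirror the lower triangle into the upper triangle to get a full matrix
aa <- as.matrix(neutral_3)
aa[upper.tri(aa)] <- t(aa)[upper.tri(aa)]
head(aa)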
## OKI HAI YUN KAT TAI HUN HAN
## OKI NA 0.07268706 0.07937617 0.14886465 0.1561456 0.06693096 0.09294157
## HAI 0.07268706 NA 0.03397451 0.10179042 0.1268924 0.02807325 0.03807541
## YUN 0.07937617 0.03397451 NA 0.07762791 0.1432923 0.04026851 0.03421635
## KAT 0.14886465 0.10179042 0.07762791 NA 0.2157330 0.10344776 0.11088115
## TAI 0.15614563 0.12689237 0.14329227 0.21573302 NA 0.12333248 0.16471928
## HUN 0.06693096 0.02807325 0.04026851 0.10344776 0.1233325 NA 0.04385787
## KLP HOC QNC SSK KAC CHA
## OKI 0.12753263 0.07966691 0.14381991 0.080917127 0.082764633 0.0807573335
## HAI 0.07062727 0.02857376 0.09737992 0.036399112 0.031226075 0.0350480370
## YUN 0.04892423 0.03983731 0.07317901 0.001820884 -0.002160858 0.0007563386
## KAT 0.12941018 0.11252413 0.14504693 0.078173740 0.078066760 0.0766586019
## TAI 0.20155700 0.14637599 0.20875730 0.142121310 0.152625899 0.1397690667
## HUN 0.07440170 0.03618065 0.10249995 0.042059268 0.038117472 0.0413984260
## KAN UTS CAM KAG BEN LAM
## OKI 0.13456942 0.10960357 0.075330223 0.10320709 0.09568234 0.08756987
## HAI 0.09711038 0.07278214 0.030013229 0.06963227 0.04835553 0.03913709
## YUN 0.10828863 0.08297531 0.002435562 0.07597810 0.01756076 0.00724616
## KAT 0.17428707 0.14702796 0.075728451 0.13942968 0.08425904 0.08100140
## TAI 0.19294422 0.17076422 0.136334355 0.16363753 0.15654314 0.15057531
## HUN 0.08888333 0.06669664 0.035903383 0.05908263 0.05334759 0.04549865
## SUF SUU KUN INW INJ MAT
## OKI 0.14108869 0.11063606 0.2181045 0.18145357 0.14398625 0.09257954
## HAI 0.09101011 0.05998414 0.1599904 0.12872924 0.09510255 0.04482997
## YUN 0.08325162 0.06270272 0.1364487 0.10513394 0.07948148 0.01682092
## KAT 0.06444802 0.06708021 0.2034093 0.09367875 0.08599529 0.09162392
## TAI 0.21266171 0.17599097 0.3003592 0.26676013 0.20046117 0.15440513
## HUN 0.09503274 0.06400679 0.1640872 0.13267018 0.09920459 0.05348803
Import sample locations
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Arrange by region
sampling_loc <- sampling_loc |>
dplyr::arrange(
Region2, Country
)
# Check it
head(sampling_loc)
## # A tibble: 6 × 7
## Pop_City Country Latitude Longitude Region Abbreviation Region2
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 Hainan China 19.2 110. Asia HAI East Asia
## 2 Yunnan China 24.5 101. Asia YUN East Asia
## 3 Hunan China 27.6 112. Asia HUN East Asia
## 4 Okinawa Japan 26.5 128. Asia OKI East Asia
## 5 Sendai Japan 38.3 141. Asia SEN East Asia
## 6 Nagasaki Japan 32.8 130. Asia NAG East Asia
Order of populations
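The chunk is hidden; presumably the order comes straight from the arranged sampling locations (a sketch):
# Population abbreviations in the arranged (region, country) order
order_pops <- sampling_loc$Abbreviation
print(order_pops)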
## [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"
Create vector with order of populations
# Extract the populations that appear in neutral_df
populations_in_neutral <- colnames(neutral_df)
# Reorder the populations based on order_pops
poporder <- populations_in_neutral[populations_in_neutral %in% order_pops]
# Print the reordered populations
print(poporder)
## [1] "OKI" "HAI" "YUN" "KAT" "TAI" "HUN" "HAN" "KLP" "HOC" "QNC" "SSK" "KAC"
## [13] "CHA" "KAN" "UTS" "CAM" "KAG" "BEN" "LAM" "SUF" "SUU" "KUN" "INW" "INJ"
## [25] "MAT"
Let's check if the matrix is symmetric.
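A one-line sketch of the hidden check:
# TRUE if the Fst matrix equals its transpose (NAs must match positions)
isSymmetric(aa)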
## [1] TRUE
Now let's order the matrix using poporder and mask the upper triangle with NA. Then we convert the matrix to a long data frame to plot it with ggplot, as sketched below.
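A sketch of these two hidden steps; pairfst.long is the object the plotting code below expects:
# Reorder rows/columns, mask the upper triangle, then melt to long format
pairfst <- aa[poporder, poporder]
pairfst[upper.tri(pairfst)] <- NA
pairfst.long <- reshape2::melt(pairfst)
summary(pairfst.long)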
## Var1 Var2 value
## OKI : 25 OKI : 25 Min. :-0.0022
## HAI : 25 HAI : 25 1st Qu.: 0.0589
## YUN : 25 YUN : 25 Median : 0.0871
## KAT : 25 KAT : 25 Mean : 0.0959
## TAI : 25 TAI : 25 3rd Qu.: 0.1325
## HUN : 25 HUN : 25 Max. : 0.3004
## (Other):475 (Other):475 NA's :325
Now let's plot the data with ggplot. You can click on the little square at the top left of the plot pane to open it in a new window, where it will have the right proportions.
pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient(
low = "white",
high = "#71b6ff",
name = "Fst",
na.value = "white",
limits = c(0, 0.5)
) +
scale_x_discrete(position = "top") +
theme_bw() +
geom_text(aes(label = ifelse(
is.na(value), "", formatC(value, digits = 2, format = "f")
)), size = 3) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
axis.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(hjust = 0)
)
pairfst.f
ggsave(
filename = here("output", "fst", "fst_matrix_neutral.pdf"),
pairfst.f,
width = 10,
height = 10,
units = "in"
)
By country
# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)
# Step 2: Calculate mean Fst for each pair of countries
# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(neutral_df))
fst_df$Abbreviation1 <- rownames(fst_df)
# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)
# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))
# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>%
group_by(Country_1, Country_2) %>%
summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>%
filter(Country_1 != Country_2)
# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)
# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)
for(i in 1:nrow(fst_matrix_summary)) {
for(j in i:nrow(fst_matrix_summary)) {
if (i == j) {
symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
} else {
avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
symmetric_fst_matrix[i, j] <- avg_value
symmetric_fst_matrix[j, i] <- avg_value
}
}
}
# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))
# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
## Cambodia China India Indonesia Japan Malaysia
## Cambodia NA 0.02278406 0.01973887 0.08102356 0.07791002 0.03291285
## China 0.02278406 NA 0.03975463 0.09135918 0.07657845 0.05151535
## India 0.01973887 0.03975463 NA 0.09289674 0.10153368 0.05126178
## Indonesia 0.08102356 0.09135918 0.09289674 NA 0.14968381 0.11478857
## Japan 0.07791002 0.07657845 0.10153368 0.14968381 NA 0.11864271
## Malaysia 0.03291285 0.05151535 0.05126178 0.11478857 0.11864271 NA
## Maldives 0.13543120 0.15350879 0.11898677 0.23259152 0.21875971 0.18365581
## Nepal 0.07572845 0.09657847 0.08425904 0.07780057 0.15122311 0.11051705
## Taiwan 0.13633435 0.12921240 0.15654314 0.21396849 0.16596381 0.17798106
## Thailand 0.00533491 0.02637889 0.02054397 0.08322300 0.08912422 0.03349593
## Vietnam 0.04581641 0.05486669 0.06564601 0.12518159 0.10974240 0.08606174
## Maldives Nepal Taiwan Thailand Vietnam
## Cambodia 0.1354312 0.07572845 0.1363344 0.00533491 0.04581641
## China 0.1535088 0.09657847 0.1292124 0.02637889 0.05486669
## India 0.1189868 0.08425904 0.1565431 0.02054397 0.06564601
## Indonesia 0.2325915 0.07780057 0.2139685 0.08322300 0.12518159
## Japan 0.2187597 0.15122311 0.1659638 0.08912422 0.10974240
## Malaysia 0.1836558 0.11051705 0.1779811 0.03349593 0.08606174
## Maldives NA 0.20340929 0.3003592 0.13905356 0.19471054
## Nepal 0.2034093 NA 0.2157330 0.07847512 0.12281740
## Taiwan 0.3003592 0.21573302 NA 0.14627290 0.17328419
## Thailand 0.1390536 0.07847512 0.1462729 NA 0.05095433
## Vietnam 0.1947105 0.12281740 0.1732842 0.05095433 NA
## Cambodia China India Indonesia Japan Malaysia
## Cambodia NA 0.02278406 0.01973887 0.08102356 0.07791002 0.03291285
## China NA NA 0.03975463 0.09135918 0.07657845 0.05151535
## India NA NA NA 0.09289674 0.10153368 0.05126178
## Indonesia NA NA NA NA 0.14968381 0.11478857
## Japan NA NA NA NA NA 0.11864271
## Malaysia NA NA NA NA NA NA
## Maldives NA NA NA NA NA NA
## Nepal NA NA NA NA NA NA
## Taiwan NA NA NA NA NA NA
## Thailand NA NA NA NA NA NA
## Vietnam NA NA NA NA NA NA
## Maldives Nepal Taiwan Thailand Vietnam
## Cambodia 0.1354312 0.07572845 0.1363344 0.00533491 0.04581641
## China 0.1535088 0.09657847 0.1292124 0.02637889 0.05486669
## India 0.1189868 0.08425904 0.1565431 0.02054397 0.06564601
## Indonesia 0.2325915 0.07780057 0.2139685 0.08322300 0.12518159
## Japan 0.2187597 0.15122311 0.1659638 0.08912422 0.10974240
## Malaysia 0.1836558 0.11051705 0.1779811 0.03349593 0.08606174
## Maldives NA 0.20340929 0.3003592 0.13905356 0.19471054
## Nepal NA NA 0.2157330 0.07847512 0.12281740
## Taiwan NA NA NA 0.14627290 0.17328419
## Thailand NA NA NA NA 0.05095433
## Vietnam NA NA NA NA NA
Now we have to convert the matrix to a data frame to plot it with ggplot.
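As before, a sketch of the hidden conversion (bb is an assumed name for the masked copy):
# Mask the lower triangle of the country matrix and melt for ggplot
bb <- symmetric_fst_matrix
bb[lower.tri(bb)] <- NA
pairfst.long2 <- reshape2::melt(bb)
summary(pairfst.long2)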
## Var1 Var2 value
## Cambodia :11 Cambodia :11 Min. :0.00533
## China :11 China :11 1st Qu.:0.07069
## India :11 India :11 Median :0.10153
## Indonesia:11 Indonesia:11 Mean :0.11113
## Japan :11 Japan :11 3rd Qu.:0.15045
## Malaysia :11 Malaysia :11 Max. :0.30036
## (Other) :55 (Other) :55 NA's :66
As before, you can click on the little square at the top left of the plot pane to open it in a new window, where it will have the right proportions.
pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient(
low = "white",
high = "#71b6ff",
name = "Fst",
na.value = "white",
limits = c(0, 0.5)
) +
scale_x_discrete(position = "top") +
theme_bw() +
geom_text(aes(label = ifelse(
is.na(value), "", formatC(value, digits = 2, format = "f")
)), size = 3) +
theme(
axis.text.x = element_text(angle = 90, hjust = 0),
axis.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(hjust = 1)
)
pairfst.f2
ggsave(
filename = here("output", "fst", "fst_matrix_neutral_by_country.pdf"),
pairfst.f2,
width = 6,
height = 5,
units = "in"
)
Remove NAs and rename columns
# remove NAs
fst2 <-
pairfst.long |>
drop_na()
# rename columns
fst2 <-
fst2 |>
dplyr::rename(pop1 = 1,
pop2 = 2,
fst = 3)
# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
dplyr::select(pop = pop2, fst)
# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)
# Calculate the mean fst for each population
mean_fst <- df_combined |>
group_by(pop) |>
summarise(mean_fst = mean(fst))
print(mean_fst)
## # A tibble: 25 × 2
## pop mean_fst
## <fct> <dbl>
## 1 OKI 0.113
## 2 HAI 0.0682
## 3 YUN 0.0562
## 4 KAT 0.112
## 5 TAI 0.175
## 6 HUN 0.0708
## 7 HAN 0.0819
## 8 KLP 0.103
## 9 HOC 0.0777
## 10 QNC 0.123
## # ℹ 15 more rows
Merge
fst3 <-
sampling_loc |>
left_join(
mean_fst,
by = c("Abbreviation" = "pop")
) |>
drop_na() |>
dplyr::select(
-Region
)
# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)
# Rename the Region2 column to Region
fst3 <- fst3 |>
dplyr::rename(Region = Region2)
# check output
head(fst3)
## # A tibble: 6 × 7
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0682
## 2 Yunnan China 24.5 101. YUN East 0.0562
## 3 Hunan China 27.6 112. HUN East 0.0708
## 4 Okinawa Japan 26.5 128. OKI East 0.113
## 5 Kanazawa Japan 36.6 137. KAN East 0.136
## 6 Utsunomiya Japan 36.6 140. UTS East 0.112
Mean by region
# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
group_by(Region) |>
summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
left_join(region_means, by = "Region")
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0682
## 2 Yunnan China 24.5 101. YUN East 0.0562
## 3 Hunan China 27.6 112. HUN East 0.0708
## 4 Okinawa Japan 26.5 128. OKI East 0.113
## 5 Kanazawa Japan 36.6 137. KAN East 0.136
## 6 Utsunomiya Japan 36.6 140. UTS East 0.112
## 7 Kagoshima Japan 31.6 131. KAG East 0.106
## 8 Tainan Taiwan 23.0 120. TAI East 0.175
## 9 Bengaluru India 13.0 77.6 BEN South 0.0689
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.189
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>
Mean by country
# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
group_by(Country) |>
summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
left_join(country_means, by = "Country")
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0682
## 2 Yunnan China 24.5 101. YUN East 0.0562
## 3 Hunan China 27.6 112. HUN East 0.0708
## 4 Okinawa Japan 26.5 128. OKI East 0.113
## 5 Kanazawa Japan 36.6 137. KAN East 0.136
## 6 Utsunomiya Japan 36.6 140. UTS East 0.112
## 7 Kagoshima Japan 31.6 131. KAG East 0.106
## 8 Tainan Taiwan 23.0 120. TAI East 0.175
## 9 Bengaluru India 13.0 77.6 BEN South 0.0689
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.189
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>
Mean by latitude
# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))
# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
group_by(Latitude_group) |>
summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
left_join(summary_by_latitude, by = "Latitude_group")
# Rename columns
fst3 <- fst3 |>
dplyr::rename(
City = Pop_City
)
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
## City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0682
## 2 Yunnan China 24.5 101. YUN East 0.0562
## 3 Hunan China 27.6 112. HUN East 0.0708
## 4 Okinawa Japan 26.5 128. OKI East 0.113
## 5 Kanazawa Japan 36.6 137. KAN East 0.136
## 6 Utsunomiya Japan 36.6 140. UTS East 0.112
## 7 Kagoshima Japan 31.6 131. KAG East 0.106
## 8 Tainan Taiwan 23.0 120. TAI East 0.175
## 9 Bengaluru India 13.0 77.6 BEN South 0.0689
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.189
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## # Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
dplyr::select(
Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst
)
fst4 <- fst4 |>
arrange(
Latitude_group, Region, Country, City
)
# Round
fst4 <- fst4 |>
mutate_if(is.numeric, ~ round(., 2))
head(fst4)
## # A tibble: 6 × 9
## Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
## <chr> <dbl> <chr> <dbl> <chr>
## 1 Above 30N 0.12 East 0.1 Japan
## 2 Above 30N 0.12 East 0.1 Japan
## 3 Above 30N 0.12 East 0.1 Japan
## 4 Below 30N 0.09 East 0.1 China
## 5 Below 30N 0.09 East 0.1 China
## 6 Below 30N 0.09 East 0.1 China
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## # Abbreviation <chr>, mean_fst <dbl>
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
font.family = "Arial",
font.size = 9,
big.mark = ",",
theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)
# Then create the flextable object
flex_table <- flextable(fst4) |>
set_caption(caption = as_paragraph(
as_chunk(
"Table 1. Fst values using intergenic SNPs.",
props = fp_text_default(color = "#000000", font.size = 14)
)
),
fp_p = fp_par(text.align = "center", padding = 5))
# Print the flextable
flex_table
Latitude_group | mean_fst_by_latitude | Region | mean_fst_by_region | Country | mean_fst_by_country | City | Abbreviation | mean_fst |
---|---|---|---|---|---|---|---|---|
Above 30N | 0.12 | East | 0.10 | Japan | 0.12 | Kagoshima | KAG | 0.11 |
Above 30N | 0.12 | East | 0.10 | Japan | 0.12 | Kanazawa | KAN | 0.14 |
Above 30N | 0.12 | East | 0.10 | Japan | 0.12 | Utsunomiya | UTS | 0.11 |
Below 30N | 0.09 | East | 0.10 | China | 0.07 | Hainan | HAI | 0.07 |
Below 30N | 0.09 | East | 0.10 | China | 0.07 | Hunan | HUN | 0.07 |
Below 30N | 0.09 | East | 0.10 | China | 0.07 | Yunnan | YUN | 0.06 |
Below 30N | 0.09 | East | 0.10 | Japan | 0.12 | Okinawa | OKI | 0.11 |
Below 30N | 0.09 | East | 0.10 | Taiwan | 0.18 | Tainan | TAI | 0.18 |
Below 30N | 0.09 | South | 0.12 | India | 0.07 | Bengaluru | BEN | 0.07 |
Below 30N | 0.09 | South | 0.12 | Maldives | 0.19 | Kunfunadhoo | KUN | 0.19 |
Below 30N | 0.09 | South | 0.12 | Nepal | 0.11 | Kathmandu | KAT | 0.11 |
Below 30N | 0.09 | Southeast | 0.09 | Cambodia | 0.05 | Phnom Penh | CAM | 0.05 |
Below 30N | 0.09 | Southeast | 0.09 | Indonesia | 0.11 | Jakarta | INJ | 0.11 |
Below 30N | 0.09 | Southeast | 0.09 | Indonesia | 0.11 | Sulawesi (Forest) | SUF | 0.11 |
Below 30N | 0.09 | Southeast | 0.09 | Indonesia | 0.11 | Sulawesi (Urban) | SUU | 0.09 |
Below 30N | 0.09 | Southeast | 0.09 | Indonesia | 0.11 | Wainyapu | INW | 0.14 |
Below 30N | 0.09 | Southeast | 0.09 | Malaysia | 0.09 | Kuala Lumpur | KLP | 0.10 |
Below 30N | 0.09 | Southeast | 0.09 | Malaysia | 0.09 | Tambun | MAT | 0.07 |
Below 30N | 0.09 | Southeast | 0.09 | Thailand | 0.06 | Chanthaburi | CHA | 0.06 |
Below 30N | 0.09 | Southeast | 0.09 | Thailand | 0.06 | Kanchanaburi | KAC | 0.06 |
Below 30N | 0.09 | Southeast | 0.09 | Thailand | 0.06 | Lampang | LAM | 0.06 |
Below 30N | 0.09 | Southeast | 0.09 | Thailand | 0.06 | Sisaket | SSK | 0.06 |
Below 30N | 0.09 | Southeast | 0.09 | Vietnam | 0.09 | Hanoi | HAN | 0.08 |
Below 30N | 0.09 | Southeast | 0.09 | Vietnam | 0.09 | Ho Chi Minh City | HOC | 0.08 |
Below 30N | 0.09 | Southeast | 0.09 | Vietnam | 0.09 | Quy Nhon City | QNC | 0.12 |
# Initialize Word document
doc <-
read_docx() |>
body_add_flextable(value = flex_table)
# Define the output path with 'here' library
output_path <- here(
"output",
"fst",
"fst_neutral_SNPS.docx"
)
# Save the Word document
print(doc, target = output_path)
To make a scatter plot
# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
dplyr::group_by(Country) |>
dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))
# save the data
saveRDS(aggregated_data, here(
"output", "fst", "neutral_country.rds"
))
# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]
# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)
# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)
# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)
ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
geom_point(aes(color = Country), size = 3) +
geom_line(aes(y = fitted_values), color = "blue") + # Fitted line
labs(
title = "Mean Fst by Country",
x = "Ordered Countries",
y = "Mean Fst Value"
) +
scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "none")
Estimate distances
# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)
# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)
# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation
# Calculate the distances
for (i in 1:n) {
for (j in 1:n) {
if (i != j) {
coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
}
}
}
# Print the distance matrix
head(distance_matrix)
## HAI YUN HUN OKI KAN UTS KAG
## HAI 0.0000 1035.308 965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081 0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN 965.7432 1107.931 0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283 0.000 1390.3468 1589.7383 617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347 0.0000 288.4938 790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738 288.4938 0.0000 1023.2713
## TAI BEN KUN KAT CAM SUF SUU INW
## HAI 1180.1356 3487.311 4241.329 2648.016 987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN 985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI 873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
## INJ KLP MAT SSK KAC CHA LAM HAN
## HAI 2844.575 1984.417 1861.932 722.0761 1218.270 1086.825 1070.0656 441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303 714.4744 601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029 953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
## HOC QNC
## HAI 990.911 604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
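As an aside, geosphere's distm() builds the whole pairwise matrix in one call, which avoids the explicit double loop (a sketch):
# One-call alternative: full pairwise Haversine distance matrix in km
coords <- as.matrix(filtered_sampling_loc[, c("Longitude", "Latitude")])
distance_matrix2 <- distm(coords, fun = distHaversine) / 1000
dimnames(distance_matrix2) <- list(filtered_sampling_loc$Abbreviation, filtered_sampling_loc$Abbreviation)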
Compare distance and FST
# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]
# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0
# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))
# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))
data <- data |>
dplyr::arrange(
Distance
)
head(data)
## Distance FST row_index col_index
## 1 0 0 HAI HAI
## 2 0 0 YUN YUN
## 3 0 0 HUN HUN
## 4 0 0 OKI OKI
## 5 0 0 KAN KAN
## 6 0 0 UTS UTS
Fit a linear regression of linearized Fst, FST/(1 - FST), against the natural log of distance (the classic isolation-by-distance regression).
# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]
# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))
# Plot
ggplot(data, aes(x = Distance, y = FST)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
annotate("text", x = max(data$Distance) * 0.85, y = max(data$FST) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
labs(title = "FST vs Distance - All populations",
x = "Distance (Km)",
y = "FST") +
scale_x_continuous(labels = scales::comma) +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
ggsave(
filename = here("output", "fst", "fst_by_distance_all_samples.pdf"),
width = 6,
height = 4,
units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
Subset by country. Select countries with at least 3 sampling localities.
countries_with_3_pops <- filtered_sampling_loc %>%
group_by(Country) %>%
filter(n() >= 3) %>%
pull(Country) %>%
unique()
countries_with_3_pops
## [1] "China" "Japan" "Indonesia" "Thailand" "Vietnam"
Run the regression for each country
results <- list()
for (country in countries_with_3_pops) {
# Extract abbreviations for the country
abbreviations <- filtered_sampling_loc %>%
filter(Country == country) %>%
pull(Abbreviation)
# Subset the data
subset_data <- data %>%
filter(row_index %in% abbreviations & col_index %in% abbreviations)
# Perform linear regression
lm_model <- lm(FST ~ Distance, data = subset_data)
results[[country]] <- list(
equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
)
}
results
## $China
## $China$equation
## [1] "y = -0.00032x + 0.395"
##
## $China$r2
## [1] "R^2 = 0.87"
##
##
## $Japan
## $Japan$equation
## [1] "y = 0.00002x + 0.111"
##
## $Japan$r2
## [1] "R^2 = 0.02"
##
##
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.00002x + 0.105"
##
## $Indonesia$r2
## [1] "R^2 = 0.11"
##
##
## $Thailand
## $Thailand$equation
## [1] "y = 0.00038x + -0.063"
##
## $Thailand$r2
## [1] "R^2 = 0.46"
##
##
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.00002x + 0.108"
##
## $Vietnam$r2
## [1] "R^2 = 0.07"
Merge the data
data_merged <- data %>%
left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
rename(Country1 = Country) %>%
left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
dplyr::select(-Pop_City.x, -Pop_City.y) %>%
filter(Country1 == Country) # Ensures the data is within the same country
# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")
# Filter
data_filtered <- data_merged %>%
group_by(Country1) %>%
filter(n() >= 3 & Country1 %in% countries_to_include) %>%
ungroup()
Calculate the linear regression for each country
regression_results <- data_filtered %>%
group_by(Country1) %>%
do(model = lm(FST ~ Distance, data = .)) %>%
rowwise() %>%
mutate(equation = sprintf("y = %.3fx + %.3f", coef(model)[2], coef(model)[1]),
r2 = sprintf("R^2 = %.2f", summary(model)$r.squared))
Plot it
ggplot(data_filtered, aes(x = Distance, y = FST)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~ Country1, scales = "free", ncol = 2) +
geom_text(data = regression_results, aes(label = paste(equation, r2, sep = "\n"), x = Inf, y = Inf),
vjust = 2, hjust = 2, size = 3.5, inherit.aes = FALSE) +
labs(title = "FST vs Distance by Country",
x = "Distance",
y = "FST") +
scale_x_continuous(labels = scales::comma) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
ggsave(
filename = here("output", "fst", "fst_by_distance_countries.pdf"),
width = 6,
height = 8,
units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
We can merge the FST and distance matrices.
# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)
# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]
# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names
# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]
# Format the elements based on their position: Fst with two decimals (upper triangle), distance in km with zero decimals (lower triangle)
for(i in 1:nrow(merged_matrix)) {
for(j in 1:ncol(merged_matrix)) {
if (i < j) {
# Upper triangular - Fst values with two decimal places
merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
} else if (i > j) {
# Lower triangular - Distance values with zero decimal places
merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
}
}
}
# Now the merged_matrix should be formatted as you need
print(merged_matrix)
## BEN CAM CHA HAI HAN HOC HUN INJ INW KAC
## BEN NA "0.02" "0.02" "0.05" "0.05" "0.06" "0.05" "0.09" "0.12" "0.02"
## CAM "2976" NA "0.00" "0.03" "0.03" "0.03" "0.04" "0.08" "0.11" "0.00"
## CHA "2660" "329" NA "0.04" "0.03" "0.04" "0.04" "0.08" "0.10" "-0.00"
## HAI "3487" "987" "1087" NA "0.04" "0.03" "0.03" "0.10" "0.13" "0.03"
## HAN "3134" "1059" "1019" "441" NA "0.04" "0.04" "0.11" "0.16" "0.03"
## HOC "3174" "209" "537" "991" "1146" NA "0.04" "0.11" "0.15" "0.04"
## HUN "3913" "1929" "1956" "966" "954" "1954" NA "0.10" "0.13" "0.04"
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA "0.07" "0.08"
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA "0.11"
## KAC "2377" "647" "320" "1218" "1028" "856" "1981" "2393" "3492" NA
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713"
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456"
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119"
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234"
## KUN "957" "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052"
## LAM "2419" "951" "692" "1070" "733" "1139" "1637" "2844" "3859" "475"
## MAT "2751" "875" "893" "1862" "1894" "912" "2799" "1363" "2650" "1060"
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265"
## QNC "3424" "528" "782" "604" "882" "436" "1565" "2241" "2880" "1047"
## SSK "2894" "402" "368" "722" "678" "547" "1595" "2390" "3271" "530"
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859" "2874"
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973" "3147"
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986" "3562" "3635" "2397"
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722"
## YUN "2806" "1487" "1323" "1035" "601" "1627" "1108" "3467" "4320" "1179"
## KAG KAN KAT KLP KUN LAM MAT OKI QNC SSK
## BEN "0.09" "0.12" "0.08" "0.07" "0.12" "0.02" "0.03" "0.10" "0.09" "0.02"
## CAM "0.07" "0.10" "0.08" "0.05" "0.14" "0.01" "0.02" "0.08" "0.07" "0.00"
## CHA "0.08" "0.11" "0.08" "0.05" "0.13" "0.01" "0.02" "0.08" "0.07" "0.00"
## HAI "0.07" "0.10" "0.10" "0.07" "0.16" "0.04" "0.04" "0.07" "0.10" "0.04"
## HAN "0.09" "0.12" "0.11" "0.08" "0.19" "0.04" "0.05" "0.09" "0.12" "0.04"
## HOC "0.08" "0.11" "0.11" "0.08" "0.18" "0.05" "0.05" "0.08" "0.11" "0.04"
## HUN "0.06" "0.09" "0.10" "0.07" "0.16" "0.05" "0.05" "0.07" "0.10" "0.04"
## INJ "0.14" "0.17" "0.09" "0.12" "0.21" "0.08" "0.09" "0.14" "0.14" "0.08"
## INW "0.17" "0.21" "0.09" "0.18" "0.28" "0.11" "0.12" "0.18" "0.18" "0.11"
## KAC "0.08" "0.11" "0.08" "0.05" "0.15" "0.00" "0.01" "0.08" "0.07" "0.00"
## KAG NA "0.11" "0.14" "0.12" "0.20" "0.08" "0.09" "0.10" "0.14" "0.08"
## KAN "791" NA "0.17" "0.15" "0.24" "0.11" "0.12" "0.13" "0.17" "0.11"
## KAT "4367" "4880" NA "0.13" "0.20" "0.08" "0.09" "0.15" "0.15" "0.08"
## KLP "4376" "5163" "3242" NA "0.22" "0.05" "0.06" "0.13" "0.13" "0.05"
## KUN "6631" "7300" "2779" "3196" NA "0.14" "0.15" "0.22" "0.22" "0.13"
## LAM "3450" "4160" "1789" "1704" "3202" NA "0.02" "0.09" "0.08" "0.01"
## MAT "4289" "5073" "3069" "177" "3124" "1531" NA "0.09" "0.09" "0.02"
## OKI "618" "1390" "4204" "3822" "6262" "3060" "3747" NA "0.14" "0.08"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA "0.07"
## SSK "3232" "3991" "2412" "1365" "3582" "624" "1217" "2758" "548" NA
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873" "1548" "1886"
## UTS "1023" "288" "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714" "2208" "2678" "1449" "1087"
## SUF SUU TAI UTS YUN
## BEN "0.09" "0.07" "0.16" "0.10" "0.02"
## CAM "0.08" "0.06" "0.14" "0.08" "0.00"
## CHA "0.08" "0.06" "0.14" "0.08" "0.00"
## HAI "0.09" "0.06" "0.13" "0.07" "0.03"
## HAN "0.12" "0.08" "0.16" "0.10" "0.03"
## HOC "0.11" "0.07" "0.15" "0.08" "0.04"
## HUN "0.10" "0.06" "0.12" "0.07" "0.04"
## INJ "0.06" "0.07" "0.20" "0.14" "0.08"
## INW "0.07" "0.10" "0.27" "0.18" "0.11"
## KAC "0.08" "0.06" "0.15" "0.09" "-0.00"
## KAG "0.14" "0.11" "0.16" "0.08" "0.08"
## KAN "0.17" "0.14" "0.19" "0.09" "0.11"
## KAT "0.06" "0.07" "0.22" "0.15" "0.08"
## KLP "0.14" "0.11" "0.20" "0.13" "0.05"
## KUN "0.23" "0.21" "0.30" "0.21" "0.14"
## LAM "0.09" "0.07" "0.15" "0.09" "0.01"
## MAT "0.09" "0.07" "0.15" "0.10" "0.02"
## OKI "0.14" "0.11" "0.16" "0.11" "0.08"
## QNC "0.15" "0.13" "0.21" "0.15" "0.07"
## SSK "0.08" "0.06" "0.14" "0.08" "0.00"
## SUF NA "0.03" "0.21" "0.14" "0.08"
## SUU "381" NA "0.18" "0.11" "0.06"
## TAI "2777" "2747" NA "0.17" "0.14"
## UTS "4755" "4566" "2418" NA "0.08"
## YUN "3566" "3744" "1928" "3902" NA
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
We can sort the populations by their mean pairwise distance.
# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)
# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))
# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]
# Initialize and fill the merged matrix as before
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]
# Format as before, but take the absolute value of the Fst (upper-triangle) entries
for(i in 1:nrow(merged_matrix)) {
for(j in 1:ncol(merged_matrix)) {
if (i < j) {
merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
} else if (i > j) {
merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
}
}
}
# Print the merged matrix
print(merged_matrix)
## SSK QNC CAM HOC CHA HAI HAN KAC LAM YUN
## SSK NA "0.07" "0.00" "0.04" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## QNC "548" NA "0.07" "0.11" "0.07" "0.10" "0.12" "0.07" "0.08" "0.07"
## CAM "402" "528" NA "0.03" "0.00" "0.03" "0.03" "0.00" "0.01" "0.00"
## HOC "547" "436" "209" NA "0.04" "0.03" "0.04" "0.04" "0.05" "0.04"
## CHA "368" "782" "329" "537" NA "0.04" "0.03" "0.00" "0.01" "0.00"
## HAI "722" "604" "987" "991" "1087" NA "0.04" "0.03" "0.04" "0.03"
## HAN "678" "882" "1059" "1146" "1019" "441" NA "0.03" "0.04" "0.03"
## KAC "530" "1047" "647" "856" "320" "1218" "1028" NA "0.00" "0.00"
## LAM "624" "1155" "951" "1139" "692" "1070" "733" "475" NA "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601" "1179" "714" NA
## MAT "1217" "1348" "875" "912" "893" "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966" "954" "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
## MAT HUN KLP TAI OKI SUF INJ SUU KAT KAG
## SSK "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## QNC "0.09" "0.10" "0.13" "0.21" "0.14" "0.15" "0.14" "0.13" "0.15" "0.14"
## CAM "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.07"
## HOC "0.05" "0.04" "0.08" "0.15" "0.08" "0.11" "0.11" "0.07" "0.11" "0.08"
## CHA "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## HAI "0.04" "0.03" "0.07" "0.13" "0.07" "0.09" "0.10" "0.06" "0.10" "0.07"
## HAN "0.05" "0.04" "0.08" "0.16" "0.09" "0.12" "0.11" "0.08" "0.11" "0.09"
## KAC "0.01" "0.04" "0.05" "0.15" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## LAM "0.02" "0.05" "0.05" "0.15" "0.09" "0.09" "0.08" "0.07" "0.08" "0.08"
## YUN "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## MAT NA "0.05" "0.06" "0.15" "0.09" "0.09" "0.09" "0.07" "0.09" "0.09"
## HUN "2799" NA "0.07" "0.12" "0.07" "0.10" "0.10" "0.06" "0.10" "0.06"
## KLP "177" "2932" NA "0.20" "0.13" "0.14" "0.12" "0.11" "0.13" "0.12"
## TAI "2894" "986" "2979" NA "0.16" "0.21" "0.20" "0.18" "0.22" "0.16"
## OKI "3747" "1599" "3822" "873" NA "0.14" "0.14" "0.11" "0.15" "0.10"
## SUF "2220" "3406" "2115" "2777" "3280" NA "0.06" "0.03" "0.06" "0.14"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA "0.07" "0.09" "0.14"
## SUU "2565" "3468" "2469" "2747" "3154" "381" "1911" NA "0.07" "0.11"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA "0.14"
## KAG "4289" "1860" "4376" "1397" "618" "3893" "4901" "3756" "4367" NA
## INW "2650" "4248" "2505" "3635" "4110" "859" "1528" "973" "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791"
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
## INW BEN KAN UTS KUN
## SSK "0.11" "0.02" "0.11" "0.08" "0.13"
## QNC "0.18" "0.09" "0.17" "0.15" "0.22"
## CAM "0.11" "0.02" "0.10" "0.08" "0.14"
## HOC "0.15" "0.06" "0.11" "0.08" "0.18"
## CHA "0.10" "0.02" "0.11" "0.08" "0.13"
## HAI "0.13" "0.05" "0.10" "0.07" "0.16"
## HAN "0.16" "0.05" "0.12" "0.10" "0.19"
## KAC "0.11" "0.02" "0.11" "0.09" "0.15"
## LAM "0.11" "0.02" "0.11" "0.09" "0.14"
## YUN "0.11" "0.02" "0.11" "0.08" "0.14"
## MAT "0.12" "0.03" "0.12" "0.10" "0.15"
## HUN "0.13" "0.05" "0.09" "0.07" "0.16"
## KLP "0.18" "0.07" "0.15" "0.13" "0.22"
## TAI "0.27" "0.16" "0.19" "0.17" "0.30"
## OKI "0.18" "0.10" "0.13" "0.11" "0.22"
## SUF "0.07" "0.09" "0.17" "0.14" "0.23"
## INJ "0.07" "0.09" "0.17" "0.14" "0.21"
## SUU "0.10" "0.07" "0.14" "0.11" "0.21"
## KAT "0.09" "0.08" "0.17" "0.15" "0.20"
## KAG "0.17" "0.09" "0.11" "0.08" "0.20"
## INW NA "0.12" "0.21" "0.18" "0.28"
## BEN "5347" NA "0.12" "0.10" "0.12"
## KAN "5424" "6410" NA "0.09" "0.24"
## UTS "5539" "6696" "288" NA "0.21"
## KUN "5513" "957" "7300" "7583" NA
Make a table and save it as a Word document
# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)
# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]
# Create a flextable object from the merged_matrix
ft <- qflextable(as.data.frame(merged_df))
ft
Population | SSK | QNC | CAM | HOC | CHA | HAI | HAN | KAC | LAM | YUN | MAT | HUN | KLP | TAI | OKI | SUF | INJ | SUU | KAT | KAG | INW | BEN | KAN | UTS | KUN |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SSK | | 0.07 | 0.00 | 0.04 | 0.00 | 0.04 | 0.04 | 0.00 | 0.01 | 0.00 | 0.02 | 0.04 | 0.05 | 0.14 | 0.08 | 0.08 | 0.08 | 0.06 | 0.08 | 0.08 | 0.11 | 0.02 | 0.11 | 0.08 | 0.13 |
QNC | 548 | | 0.07 | 0.11 | 0.07 | 0.10 | 0.12 | 0.07 | 0.08 | 0.07 | 0.09 | 0.10 | 0.13 | 0.21 | 0.14 | 0.15 | 0.14 | 0.13 | 0.15 | 0.14 | 0.18 | 0.09 | 0.17 | 0.15 | 0.22 |
CAM | 402 | 528 | | 0.03 | 0.00 | 0.03 | 0.03 | 0.00 | 0.01 | 0.00 | 0.02 | 0.04 | 0.05 | 0.14 | 0.08 | 0.08 | 0.08 | 0.06 | 0.08 | 0.07 | 0.11 | 0.02 | 0.10 | 0.08 | 0.14 |
HOC | 547 | 436 | 209 | | 0.04 | 0.03 | 0.04 | 0.04 | 0.05 | 0.04 | 0.05 | 0.04 | 0.08 | 0.15 | 0.08 | 0.11 | 0.11 | 0.07 | 0.11 | 0.08 | 0.15 | 0.06 | 0.11 | 0.08 | 0.18 |
CHA | 368 | 782 | 329 | 537 | | 0.04 | 0.03 | 0.00 | 0.01 | 0.00 | 0.02 | 0.04 | 0.05 | 0.14 | 0.08 | 0.08 | 0.08 | 0.06 | 0.08 | 0.08 | 0.10 | 0.02 | 0.11 | 0.08 | 0.13 |
HAI | 722 | 604 | 987 | 991 | 1087 | | 0.04 | 0.03 | 0.04 | 0.03 | 0.04 | 0.03 | 0.07 | 0.13 | 0.07 | 0.09 | 0.10 | 0.06 | 0.10 | 0.07 | 0.13 | 0.05 | 0.10 | 0.07 | 0.16 |
HAN | 678 | 882 | 1059 | 1146 | 1019 | 441 | | 0.03 | 0.04 | 0.03 | 0.05 | 0.04 | 0.08 | 0.16 | 0.09 | 0.12 | 0.11 | 0.08 | 0.11 | 0.09 | 0.16 | 0.05 | 0.12 | 0.10 | 0.19 |
KAC | 530 | 1047 | 647 | 856 | 320 | 1218 | 1028 | | 0.00 | 0.00 | 0.01 | 0.04 | 0.05 | 0.15 | 0.08 | 0.08 | 0.08 | 0.06 | 0.08 | 0.08 | 0.11 | 0.02 | 0.11 | 0.09 | 0.15 |
LAM | 624 | 1155 | 951 | 1139 | 692 | 1070 | 733 | 475 | | 0.01 | 0.02 | 0.05 | 0.05 | 0.15 | 0.09 | 0.09 | 0.08 | 0.07 | 0.08 | 0.08 | 0.11 | 0.02 | 0.11 | 0.09 | 0.14 |
YUN | 1087 | 1449 | 1487 | 1627 | 1323 | 1035 | 601 | 1179 | 714 | | 0.02 | 0.04 | 0.05 | 0.14 | 0.08 | 0.08 | 0.08 | 0.06 | 0.08 | 0.08 | 0.11 | 0.02 | 0.11 | 0.08 | 0.14 |
MAT | 1217 | 1348 | 875 | 912 | 893 | 1862 | 1894 | 1060 | 1531 | 2208 | | 0.05 | 0.06 | 0.15 | 0.09 | 0.09 | 0.09 | 0.07 | 0.09 | 0.09 | 0.12 | 0.03 | 0.12 | 0.10 | 0.15 |
HUN | 1595 | 1565 | 1929 | 1954 | 1956 | 966 | 954 | 1981 | 1637 | 1108 | 2799 | | 0.07 | 0.12 | 0.07 | 0.10 | 0.10 | 0.06 | 0.10 | 0.06 | 0.13 | 0.05 | 0.09 | 0.07 | 0.16 |
KLP | 1365 | 1445 | 1003 | 1011 | 1055 | 1984 | 2042 | 1234 | 1704 | 2375 | 177 | 2932 | | 0.20 | 0.13 | 0.14 | 0.12 | 0.11 | 0.13 | 0.12 | 0.18 | 0.07 | 0.15 | 0.13 | 0.22 |
TAI | 1886 | 1548 | 2061 | 1982 | 2237 | 1180 | 1497 | 2397 | 2219 | 1928 | 2894 | 986 | 2979 | | 0.16 | 0.21 | 0.20 | 0.18 | 0.22 | 0.16 | 0.27 | 0.16 | 0.19 | 0.17 | 0.30 |
OKI | 2758 | 2410 | 2929 | 2839 | 3110 | 2047 | 2328 | 3265 | 3060 | 2678 | 3747 | 1599 | 3822 | 873 | | 0.14 | 0.14 | 0.11 | 0.15 | 0.10 | 0.18 | 0.10 | 0.13 | 0.11 | 0.22 |
SUF | 2567 | 2117 | 2245 | 2046 | 2557 | 2614 | 2986 | 2874 | 3182 | 3566 | 2220 | 3406 | 2115 | 2777 | 3280 | | 0.06 | 0.03 | 0.06 | 0.14 | 0.07 | 0.09 | 0.17 | 0.14 | 0.23 |
INJ | 2390 | 2241 | 1989 | 1889 | 2160 | 2845 | 3034 | 2393 | 2844 | 3467 | 1363 | 3805 | 1188 | 3562 | 4294 | 1535 | | 0.07 | 0.09 | 0.14 | 0.07 | 0.09 | 0.17 | 0.14 | 0.21 |
SUU | 2797 | 2309 | 2506 | 2300 | 2827 | 2752 | 3151 | 3147 | 3420 | 3744 | 2565 | 3468 | 2469 | 2747 | 3154 | 381 | 1911 | | 0.07 | 0.11 | 0.10 | 0.07 | 0.14 | 0.11 | 0.21 |
KAT | 2412 | 2922 | 2724 | 2921 | 2424 | 2648 | 2207 | 2119 | 1789 | 1640 | 3069 | 2611 | 3242 | 3537 | 4204 | 4967 | 4429 | 5209 | | 0.14 | 0.09 | 0.08 | 0.17 | 0.15 | 0.20 |
KAG | 3232 | 2942 | 3446 | 3377 | 3595 | 2510 | 2721 | 3713 | 3450 | 2967 | 4289 | 1860 | 4376 | 1397 | 618 | 3893 | 4901 | 3756 | 4367 | | 0.17 | 0.09 | 0.11 | 0.08 | 0.20 |
INW | 3271 | 2880 | 2908 | 2727 | 3190 | 3419 | 3762 | 3492 | 3859 | 4320 | 2650 | 4248 | 2505 | 3635 | 4110 | 859 | 1528 | 973 | 5611 | 4718 | | 0.12 | 0.21 | 0.18 | 0.28 |
BEN | 2894 | 3424 | 2976 | 3174 | 2660 | 3487 | 3134 | 2377 | 2419 | 2806 | 2751 | 3913 | 2869 | 4631 | 5450 | 4970 | 3876 | 5307 | 1828 | 5773 | 5347 | | 0.12 | 0.10 | 0.12 |
KAN | 3991 | 3725 | 4223 | 4161 | 4357 | 3270 | 3444 | 4456 | 4160 | 3619 | 5073 | 2532 | 5163 | 2185 | 1390 | 4621 | 5685 | 4452 | 4880 | 791 | 5424 | 6410 | | 0.09 | 0.24 |
UTS | 4249 | 3965 | 4470 | 4399 | 4613 | 3527 | 3715 | 4722 | 4435 | 3902 | 5311 | 2811 | 5396 | 2418 | 1590 | 4755 | 5874 | 4566 | 5167 | 1023 | 5539 | 6696 | 288 | | 0.21 |
KUN | 3582 | 4070 | 3572 | 3749 | 3288 | 4241 | 3934 | 3052 | 3202 | 3681 | 3124 | 4777 | 3196 | 5414 | 6262 | 5293 | 3986 | 5658 | 2779 | 6631 | 5513 | 957 | 7300 | 7583 | |
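The Word export chunk is not shown here; mirroring the officer pattern used for the earlier table (file name assumed):
# Initialize Word document and add the flextable
doc <- read_docx() |>
body_add_flextable(value = ft)
# Save the Word document (assumed file name)
print(doc, target = here("output", "fst", "fst_distance_matrix.docx"))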
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6247934 333.7 11285547 602.8 NA 10548578 563.4
## Vcells 10497484 80.1 18053012 137.8 32768 14977146 114.3
Now we repeat the analysis with the LD-pruned SNP set (r2_0.01): convert to raw format and subset the same populations.
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.01 \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/r2_0.01 \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.01.log
## 21095 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.969806.
## 21095 variants and 230 people pass filters and QC.
See https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of the Fst estimation.
r2_0.01 <-
read.PLINK(
here(
"output", "fst", "r2_0.01.raw"
),
quiet = FALSE,
chunkSize = 1000,
parallel = require("parallel"),
n.cores = 4
)
summary(r2_0.01)
This chunk will take a couple of minutes to run.
# convert
r2_0.01_2 <- stamppConvert(r2_0.01, type="genlight")
# run stampp. If you want to run it with bootstraps and nclusters, use the HPC; it will run out of memory on a 32 GB laptop
r2_0.01_3 <- stamppFst(r2_0.01_2, 1, 95, 1)
Save the result so we can reload it later without rerunning stamppFst.
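As before, a sketch of the hidden save/load chunks (file name assumed):
# Save the stamppFst result for the LD-pruned set (assumed file name)
saveRDS(r2_0.01_3, here("output", "fst", "r2_0.01_3.rds"))
# To load it back later
r2_0.01_3 <- readRDS(here("output", "fst", "r2_0.01_3.rds"))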
Now let's look at the object.
## OKI HAI YUN KAT
## Min. :0.07411 Min. :0.03305 Min. :-0.0000684 Min. :0.05983
## 1st Qu.:0.12457 1st Qu.:0.05357 1st Qu.: 0.0202758 1st Qu.:0.09938
## Median :0.13500 Median :0.08638 Median : 0.0789742 Median :0.11057
## Mean :0.15247 Mean :0.09338 Mean : 0.0821429 Mean :0.14004
## 3rd Qu.:0.17245 3rd Qu.:0.13393 3rd Qu.: 0.1383007 3rd Qu.:0.17033
## Max. :0.27404 Max. :0.19267 Max. : 0.1838969 Max. :0.25614
## NA's :1 NA's :2 NA's :3 NA's :4
## TAI HUN HAN KLP
## Min. :0.1293 Min. :0.04315 Min. :0.03769 Min. :0.05066
## 1st Qu.:0.1818 1st Qu.:0.07836 1st Qu.:0.05082 1st Qu.:0.06001
## Median :0.1963 Median :0.08788 Median :0.09712 Median :0.13557
## Mean :0.2157 Mean :0.10679 Mean :0.10658 Mean :0.13089
## 3rd Qu.:0.2423 3rd Qu.:0.12706 3rd Qu.:0.15227 3rd Qu.:0.19086
## Max. :0.3508 Max. :0.21464 Max. :0.22468 Max. :0.23994
## NA's :5 NA's :6 NA's :7 NA's :8
## HOC QNC SSK KAC
## Min. :0.04799 Min. :0.07995 Min. :0.003314 Min. :0.001008
## 1st Qu.:0.06683 1st Qu.:0.08570 1st Qu.:0.011973 1st Qu.:0.018685
## Median :0.10139 Median :0.17159 Median :0.110002 Median :0.127829
## Mean :0.11328 Mean :0.15440 Mean :0.086747 Mean :0.096343
## 3rd Qu.:0.15323 3rd Qu.:0.21452 3rd Qu.:0.148341 3rd Qu.:0.161847
## Max. :0.21621 Max. :0.24560 Max. :0.182914 Max. :0.191394
## NA's :9 NA's :10 NA's :11 NA's :12
## CHA KAN UTS CAM
## Min. :0.003687 Min. :0.1015 Min. :0.08843 Min. :0.008604
## 1st Qu.:0.023862 1st Qu.:0.1789 1st Qu.:0.16593 1st Qu.:0.026054
## Median :0.125757 Median :0.1975 Median :0.17580 Median :0.118910
## Mean :0.097930 Mean :0.2137 Mean :0.19727 Mean :0.091140
## 3rd Qu.:0.149282 3rd Qu.:0.2658 3rd Qu.:0.23861 3rd Qu.:0.127924
## Max. :0.179347 Max. :0.3277 Max. :0.29881 Max. :0.153901
## NA's :13 NA's :14 NA's :15 NA's :16
## KAG BEN LAM SUF
## Min. :0.1453 Min. :0.02933 Min. :0.01885 Min. :0.03927
## 1st Qu.:0.1535 1st Qu.:0.07534 1st Qu.:0.10804 1st Qu.:0.08085
## Median :0.1918 Median :0.13014 Median :0.13373 Median :0.09416
## Mean :0.1996 Mean :0.10628 Mean :0.11948 Mean :0.12977
## 3rd Qu.:0.2348 3rd Qu.:0.13699 3rd Qu.:0.15615 3rd Qu.:0.14373
## Max. :0.2757 Max. :0.15984 Max. :0.16749 Max. :0.29085
## NA's :17 NA's :18 NA's :19 NA's :20
## SUU KUN INW INJ
## Min. :0.08632 Min. :0.1695 Min. :0.06375 Min. :0.1298
## 1st Qu.:0.09984 1st Qu.:0.2147 1st Qu.:0.08902 1st Qu.:0.1298
## Median :0.11444 Median :0.2599 Median :0.11429 Median :0.1298
## Mean :0.14125 Mean :0.2548 Mean :0.11429 Mean :0.1298
## 3rd Qu.:0.15586 3rd Qu.:0.2974 3rd Qu.:0.13955 3rd Qu.:0.1298
## Max. :0.24981 Max. :0.3349 Max. :0.16482 Max. :0.1298
## NA's :21 NA's :22 NA's :23 NA's :24
## MAT
## Min. : NA
## 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :25
If you want, you can save the Fst values as a CSV file.
# Convert to data frame
r2_0.01_df <- data.frame(r2_0.01_3)
# Save it
write.csv(r2_0.01_df, file = here("output", "fst", "r2_0.01_df.csv"))
Check the Fst values
## OKI HAI YUN KAT TAI HUN HAN KLP HOC QNC
## OKI NA NA NA NA NA NA NA NA NA NA
## HAI 0.08801929 NA NA NA NA NA NA NA NA NA
## YUN 0.12717041 0.05389545 NA NA NA NA NA NA NA NA
## KAT 0.19706597 0.13069201 0.09803865 NA NA NA NA NA NA NA
## TAI 0.16437593 0.13725088 0.18359315 0.2561412 NA NA NA NA NA NA
## HUN 0.07411078 0.03611331 0.07708983 0.1485081 0.1292503 NA NA NA NA NA
## SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## HAI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## YUN NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## KAT NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## TAI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## HUN NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Now let's get the Fst values from the stamppFst output. When run with bootstraps (e.g., on the HPC), the object also holds the bootstraps, CI limits, and p-values; here we convert the Fst values into a matrix.
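The conversion chunk is folded; a sketch that mirrors the lower triangle into the upper to build the full pairwise matrix (the name aa is reused from here on):
# Build the full pairwise matrix from the stamppFst output (the diagonal stays NA)
aa <- as.matrix(r2_0.01_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(aa)]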
## OKI HAI YUN KAT TAI HUN HAN
## OKI NA 0.08801929 0.12717041 0.19706597 0.1643759 0.07411078 0.11064999
## HAI 0.08801929 NA 0.05389545 0.13069201 0.1372509 0.03611331 0.03832073
## YUN 0.12717041 0.05389545 NA 0.09803865 0.1835931 0.07708983 0.04600133
## KAT 0.19706597 0.13069201 0.09803865 NA 0.2561412 0.14850811 0.13423035
## TAI 0.16437593 0.13725088 0.18359315 0.25614121 NA 0.12925031 0.17880081
## HUN 0.07411078 0.03611331 0.07708983 0.14850811 0.1292503 NA 0.05046773
## KLP HOC QNC SSK KAC CHA
## OKI 0.16656985 0.09549009 0.19007520 0.12817076 1.311262e-01 0.125744140
## HAI 0.08638337 0.03305455 0.12288092 0.05630853 5.254290e-02 0.053245910
## YUN 0.05670882 0.06185930 0.08085858 0.00366398 -6.838534e-05 0.001604481
## KAT 0.14708471 0.14244369 0.17033112 0.09937748 1.006626e-01 0.097234843
## TAI 0.23998253 0.15425313 0.24922008 0.18283845 1.967321e-01 0.178141681
## HUN 0.10841416 0.04314995 0.14309475 0.08054483 7.900004e-02 0.077728277
## KAN UTS CAM KAG BEN LAM
## OKI 0.1634040 0.13493024 0.114640596 0.12106263 0.14616521 0.132969333
## HAI 0.1371587 0.11075509 0.043819388 0.09793452 0.07341023 0.058877477
## YUN 0.1838969 0.15870552 0.003279041 0.13988820 0.02448527 0.006938259
## KAT 0.2486329 0.22282018 0.095547956 0.20548615 0.10205812 0.103613929
## TAI 0.2220346 0.19580993 0.169327020 0.18371222 0.19967862 0.191790397
## HUN 0.1110223 0.08527642 0.066905755 0.07356335 0.09705144 0.084689086
## SUF SUU KUN INW INJ MAT
## OKI 0.21643883 0.15398736 0.2740437 0.2564655 0.21162371 0.13506645
## HAI 0.14879059 0.08892088 0.1926687 0.1850661 0.14633947 0.06332853
## YUN 0.13353818 0.09595000 0.1591835 0.1508741 0.12218318 0.01887260
## KAT 0.05982659 0.06964651 0.2290544 0.1035899 0.09395829 0.11057327
## TAI 0.28326958 0.21716118 0.3507787 0.3354167 0.26762867 0.18747764
## HUN 0.16381699 0.10041490 0.2146355 0.2000360 0.16129956 0.08787756
Import sample locations
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Arrange by region
sampling_loc <- sampling_loc |>
dplyr::arrange(
Region2, Country
)
# Check it
head(sampling_loc)
## # A tibble: 6 × 7
## Pop_City Country Latitude Longitude Region Abbreviation Region2
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 Hainan China 19.2 110. Asia HAI East Asia
## 2 Yunnan China 24.5 101. Asia YUN East Asia
## 3 Hunan China 27.6 112. Asia HUN East Asia
## 4 Okinawa Japan 26.5 128. Asia OKI East Asia
## 5 Sendai Japan 38.3 141. Asia SEN East Asia
## 6 Nagasaki Japan 32.8 130. Asia NAG East Asia
Order
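The chunk that builds the plotting order is folded; a minimal sketch, assuming the order comes straight from the arranged sampling_loc:
order_pops <- sampling_loc$Abbreviation
print(order_pops)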
## [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"
Create vector with order of populations
# Extract the populations that appear in the Fst data frame (r2_0.01_df)
populations_in_r2_0.01 <- colnames(r2_0.01_df)
# Keep only the populations that are in order_pops (the matrix order is preserved)
poporder <- populations_in_r2_0.01[populations_in_r2_0.01 %in% order_pops]
# Print the reordered populations
print(poporder)
## [1] "OKI" "HAI" "YUN" "KAT" "TAI" "HUN" "HAN" "KLP" "HOC" "QNC" "SSK" "KAC"
## [13] "CHA" "KAN" "UTS" "CAM" "KAG" "BEN" "LAM" "SUF" "SUU" "KUN" "INW" "INJ"
## [25] "MAT"
Let's check if the matrix is symmetric.
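The check itself is folded; since aa was mirrored above, presumably:
isSymmetric(aa)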
## [1] TRUE
Now let's order the matrix using poporder. We will also fill the redundant triangle and the diagonal with NA so each pairwise value appears only once in the heatmap.
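A sketch of this folded step:
# Reorder populations and blank the lower triangle plus the diagonal
aa <- aa[poporder, poporder]
aa[lower.tri(aa, diag = TRUE)] <- NA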
Now we have to convert the matrix to a data frame to plot it with ggplot.
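A sketch using reshape2::melt (loaded at the top):
pairfst.long <- melt(aa)
summary(pairfst.long)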
## Var1 Var2 value
## OKI : 25 OKI : 25 Min. :-0.0001
## HAI : 25 HAI : 25 1st Qu.: 0.0797
## YUN : 25 YUN : 25 Median : 0.1309
## KAT : 25 KAT : 25 Mean : 0.1312
## TAI : 25 TAI : 25 3rd Qu.: 0.1735
## HUN : 25 HUN : 25 Max. : 0.3508
## (Other):475 (Other):475 NA's :325
Now let's plot the data with ggplot. You can click the small square at the top left of the plot pane to open it in a new window with the correct proportions.
pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient(
low = "white",
high = "#71b6ff",
name = "Fst",
na.value = "white",
limits = c(0, 0.5)
) +
scale_x_discrete(position = "top") +
theme_bw() +
geom_text(aes(label = ifelse(
is.na(value), "", formatC(value, digits = 2, format = "f")
)), size = 3) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
axis.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(hjust = 0)
)
pairfst.f
ggsave(
filename = here("output", "fst", "fst_matrix_r2_0.01.pdf"),
pairfst.f,
width = 10,
height = 10,
units = "in"
)
By country
# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)
# Step 2: Calculate mean Fst for each pair of countries
# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(r2_0.01_df))
fst_df$Abbreviation1 <- rownames(fst_df)
# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)
# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))
# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>%
group_by(Country_1, Country_2) %>%
summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>%
filter(Country_1 != Country_2)
# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)
# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)
for(i in 1:nrow(fst_matrix_summary)) {
for(j in i:nrow(fst_matrix_summary)) {
if (i == j) {
symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
} else {
avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
symmetric_fst_matrix[i, j] <- avg_value
symmetric_fst_matrix[j, i] <- avg_value
}
}
}
# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))
# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
## Cambodia China India Indonesia Japan Malaysia
## Cambodia NA 0.03800139 0.02605433 0.12095720 0.1346749 0.03574590
## China 0.038001395 NA 0.06498231 0.14143583 0.1092279 0.07026417
## India 0.026054327 0.06498231 NA 0.13584651 0.1678509 0.05980367
## Indonesia 0.120957203 0.14143583 0.13584651 NA 0.2313825 0.15803996
## Japan 0.134674917 0.10922792 0.16785086 0.23138254 NA 0.18141023
## Malaysia 0.035745900 0.07026417 0.05980367 0.15803996 0.1814102 NA
## Maldives 0.153900858 0.18882922 0.13014258 0.28386514 0.2940620 0.20471972
## Nepal 0.095547956 0.13143672 0.10205812 0.08175532 0.2113562 0.12882899
## Taiwan 0.169327020 0.14483616 0.19967862 0.27586902 0.1824474 0.21373008
## Thailand 0.006331097 0.04625628 0.02756166 0.12861509 0.1536568 0.03828791
## Vietnam 0.055359685 0.06885421 0.08411290 0.17209901 0.1477431 0.09951772
## Maldives Nepal Taiwan Thailand Vietnam
## Cambodia 0.1539009 0.09554796 0.1693270 0.006331097 0.05535968
## China 0.1888292 0.13143672 0.1448362 0.046256283 0.06885421
## India 0.1301426 0.10205812 0.1996786 0.027561664 0.08411290
## Indonesia 0.2838651 0.08175532 0.2758690 0.128615090 0.17209901
## Japan 0.2940620 0.21135619 0.1824474 0.153656777 0.14774309
## Malaysia 0.2047197 0.12882899 0.2137301 0.038287915 0.09951772
## Maldives NA 0.22905444 0.3507787 0.162330775 0.22600173
## Nepal 0.2290544 NA 0.2561412 0.100222216 0.14900172
## Taiwan 0.3507787 0.25614121 NA 0.187375652 0.19409134
## Thailand 0.1623308 0.10022222 0.1873757 NA 0.06532773
## Vietnam 0.2260017 0.14900172 0.1940913 0.065327725 NA
## Cambodia China India Indonesia Japan Malaysia
## Cambodia NA 0.03800139 0.02605433 0.1209572 0.1346749 0.03574590
## China NA NA 0.06498231 0.1414358 0.1092279 0.07026417
## India NA NA NA 0.1358465 0.1678509 0.05980367
## Indonesia NA NA NA NA 0.2313825 0.15803996
## Japan NA NA NA NA NA 0.18141023
## Malaysia NA NA NA NA NA NA
## Maldives NA NA NA NA NA NA
## Nepal NA NA NA NA NA NA
## Taiwan NA NA NA NA NA NA
## Thailand NA NA NA NA NA NA
## Vietnam NA NA NA NA NA NA
## Maldives Nepal Taiwan Thailand Vietnam
## Cambodia 0.1539009 0.09554796 0.1693270 0.006331097 0.05535968
## China 0.1888292 0.13143672 0.1448362 0.046256283 0.06885421
## India 0.1301426 0.10205812 0.1996786 0.027561664 0.08411290
## Indonesia 0.2838651 0.08175532 0.2758690 0.128615090 0.17209901
## Japan 0.2940620 0.21135619 0.1824474 0.153656777 0.14774309
## Malaysia 0.2047197 0.12882899 0.2137301 0.038287915 0.09951772
## Maldives NA 0.22905444 0.3507787 0.162330775 0.22600173
## Nepal NA NA 0.2561412 0.100222216 0.14900172
## Taiwan NA NA NA 0.187375652 0.19409134
## Thailand NA NA NA NA 0.06532773
## Vietnam NA NA NA NA NA
Now we have to convert the matrix to a data frame to plot it with ggplot.
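As above, a sketch that keeps one triangle of the symmetric country matrix before melting:
country_mat <- symmetric_fst_matrix
country_mat[lower.tri(country_mat)] <- NA
pairfst.long2 <- melt(country_mat)
summary(pairfst.long2)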
## Var1 Var2 value
## Cambodia :11 Cambodia :11 Min. :0.00633
## China :11 China :11 1st Qu.:0.08293
## India :11 India :11 Median :0.14144
## Indonesia:11 Indonesia:11 Mean :0.14212
## Japan :11 Japan :11 3rd Qu.:0.18810
## Malaysia :11 Malaysia :11 Max. :0.35078
## (Other) :55 (Other) :55 NA's :66
You can click the small square at the top left of the plot pane to open it in a new window with the correct proportions.
pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient(
low = "white",
high = "pink",
name = "Fst",
na.value = "white",
limits = c(0, 0.5)
) +
scale_x_discrete(position = "top") +
theme_bw() +
geom_text(aes(label = ifelse(
is.na(value), "", formatC(value, digits = 2, format = "f")
)), size = 3) +
theme(
axis.text.x = element_text(angle = 90, hjust = 0),
axis.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(hjust = 1)
)
pairfst.f2
ggsave(
filename = here("output", "fst", "fst_matrix_r2_0.01_by_country.pdf"),
pairfst.f2,
width = 6,
height = 5,
units = "in"
)
Remove NAs and rename columns
# remove NAs
fst2 <-
pairfst.long |>
drop_na()
# rename columns
fst2 <-
fst2 |>
dplyr::rename(pop1 = 1,
pop2 = 2,
fst = 3)
# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
dplyr::select(pop = pop2, fst)
# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)
# Calculate the mean fst for each population
mean_fst <- df_combined |>
group_by(pop) |>
summarise(mean_fst = mean(fst))
print(mean_fst)
## # A tibble: 25 × 2
## pop mean_fst
## <fct> <dbl>
## 1 OKI 0.152
## 2 HAI 0.0932
## 3 YUN 0.0828
## 4 KAT 0.140
## 5 TAI 0.211
## 6 HUN 0.104
## 7 HAN 0.103
## 8 KLP 0.130
## 9 HOC 0.104
## 10 QNC 0.153
## # ℹ 15 more rows
Merge
fst3 <-
sampling_loc |>
left_join(
mean_fst,
by = c("Abbreviation" = "pop")
) |>
drop_na() |>
dplyr::select(
-Region
)
# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)
# Rename the Region2 column to Region
fst3 <- fst3 |>
dplyr::rename(Region = Region2)
# check output
head(fst3)
## # A tibble: 6 × 7
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0932
## 2 Yunnan China 24.5 101. YUN East 0.0828
## 3 Hunan China 27.6 112. HUN East 0.104
## 4 Okinawa Japan 26.5 128. OKI East 0.152
## 5 Kanazawa Japan 36.6 137. KAN East 0.198
## 6 Utsunomiya Japan 36.6 140. UTS East 0.173
Mean by region
# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
group_by(Region) |>
summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
left_join(region_means, by = "Region")
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0932
## 2 Yunnan China 24.5 101. YUN East 0.0828
## 3 Hunan China 27.6 112. HUN East 0.104
## 4 Okinawa Japan 26.5 128. OKI East 0.152
## 5 Kanazawa Japan 36.6 137. KAN East 0.198
## 6 Utsunomiya Japan 36.6 140. UTS East 0.173
## 7 Kagoshima Japan 31.6 131. KAG East 0.157
## 8 Tainan Taiwan 23.0 120. TAI East 0.211
## 9 Bengaluru India 13.0 77.6 BEN South 0.0978
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.228
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>
Mean by country
# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
group_by(Country) |>
summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
left_join(country_means, by = "Country")
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0932
## 2 Yunnan China 24.5 101. YUN East 0.0828
## 3 Hunan China 27.6 112. HUN East 0.104
## 4 Okinawa Japan 26.5 128. OKI East 0.152
## 5 Kanazawa Japan 36.6 137. KAN East 0.198
## 6 Utsunomiya Japan 36.6 140. UTS East 0.173
## 7 Kagoshima Japan 31.6 131. KAG East 0.157
## 8 Tainan Taiwan 23.0 120. TAI East 0.211
## 9 Bengaluru India 13.0 77.6 BEN South 0.0978
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.228
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>
Mean by latitude
# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))
# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
group_by(Latitude_group) |>
summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
left_join(summary_by_latitude, by = "Latitude_group")
# Rename columns
fst3 <- fst3 |>
dplyr::rename(
City = Pop_City
)
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
## City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0932
## 2 Yunnan China 24.5 101. YUN East 0.0828
## 3 Hunan China 27.6 112. HUN East 0.104
## 4 Okinawa Japan 26.5 128. OKI East 0.152
## 5 Kanazawa Japan 36.6 137. KAN East 0.198
## 6 Utsunomiya Japan 36.6 140. UTS East 0.173
## 7 Kagoshima Japan 31.6 131. KAG East 0.157
## 8 Tainan Taiwan 23.0 120. TAI East 0.211
## 9 Bengaluru India 13.0 77.6 BEN South 0.0978
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.228
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## # Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
dplyr::select(
Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst
)
fst4 <- fst4 |>
arrange(
Latitude_group, Region, Country, City
)
# Round
fst4 <- fst4 |>
mutate_if(is.numeric, ~ round(., 2))
head(fst4)
## # A tibble: 6 × 9
## Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
## <chr> <dbl> <chr> <dbl> <chr>
## 1 Above 30N 0.18 East 0.15 Japan
## 2 Above 30N 0.18 East 0.15 Japan
## 3 Above 30N 0.18 East 0.15 Japan
## 4 Below 30N 0.13 East 0.15 China
## 5 Below 30N 0.13 East 0.15 China
## 6 Below 30N 0.13 East 0.15 China
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## # Abbreviation <chr>, mean_fst <dbl>
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
font.family = "Arial",
font.size = 9,
big.mark = ",",
theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)
# Then create the flextable object
flex_table <- flextable(fst4) |>
set_caption(caption = as_paragraph(
as_chunk(
"Table 1. Fst values using SNPs after prunning with r2 0.01.",
props = fp_text_default(color = "#000000", font.size = 14)
)
),
fp_p = fp_par(text.align = "center", padding = 5))
# Print the flextable
flex_table
Latitude_group | mean_fst_by_latitude | Region | mean_fst_by_region | Country | mean_fst_by_country | City | Abbreviation | mean_fst |
---|---|---|---|---|---|---|---|---|
Above 30N | 0.18 | East | 0.15 | Japan | 0.17 | Kagoshima | KAG | 0.16 |
Above 30N | 0.18 | East | 0.15 | Japan | 0.17 | Kanazawa | KAN | 0.20 |
Above 30N | 0.18 | East | 0.15 | Japan | 0.17 | Utsunomiya | UTS | 0.17 |
Below 30N | 0.13 | East | 0.15 | China | 0.09 | Hainan | HAI | 0.09 |
Below 30N | 0.13 | East | 0.15 | China | 0.09 | Hunan | HUN | 0.10 |
Below 30N | 0.13 | East | 0.15 | China | 0.09 | Yunnan | YUN | 0.08 |
Below 30N | 0.13 | East | 0.15 | Japan | 0.17 | Okinawa | OKI | 0.15 |
Below 30N | 0.13 | East | 0.15 | Taiwan | 0.21 | Tainan | TAI | 0.21 |
Below 30N | 0.13 | South | 0.16 | India | 0.10 | Bengaluru | BEN | 0.10 |
Below 30N | 0.13 | South | 0.16 | Maldives | 0.23 | Kunfunadhoo | KUN | 0.23 |
Below 30N | 0.13 | South | 0.16 | Nepal | 0.14 | Kathmandu | KAT | 0.14 |
Below 30N | 0.13 | Southeast | 0.12 | Cambodia | 0.08 | Phnom Penh | CAM | 0.08 |
Below 30N | 0.13 | Southeast | 0.12 | Indonesia | 0.16 | Jakarta | INJ | 0.16 |
Below 30N | 0.13 | Southeast | 0.12 | Indonesia | 0.16 | Sulawesi (Forest) | SUF | 0.16 |
Below 30N | 0.13 | Southeast | 0.12 | Indonesia | 0.16 | Sulawesi (Urban) | SUU | 0.12 |
Below 30N | 0.13 | Southeast | 0.12 | Indonesia | 0.16 | Wainyapu | INW | 0.19 |
Below 30N | 0.13 | Southeast | 0.12 | Malaysia | 0.11 | Kuala Lumpur | KLP | 0.13 |
Below 30N | 0.13 | Southeast | 0.12 | Malaysia | 0.11 | Tambun | MAT | 0.09 |
Below 30N | 0.13 | Southeast | 0.12 | Thailand | 0.09 | Chanthaburi | CHA | 0.08 |
Below 30N | 0.13 | Southeast | 0.12 | Thailand | 0.09 | Kanchanaburi | KAC | 0.09 |
Below 30N | 0.13 | Southeast | 0.12 | Thailand | 0.09 | Lampang | LAM | 0.09 |
Below 30N | 0.13 | Southeast | 0.12 | Thailand | 0.09 | Sisaket | SSK | 0.08 |
Below 30N | 0.13 | Southeast | 0.12 | Vietnam | 0.12 | Hanoi | HAN | 0.10 |
Below 30N | 0.13 | Southeast | 0.12 | Vietnam | 0.12 | Ho Chi Minh City | HOC | 0.10 |
Below 30N | 0.13 | Southeast | 0.12 | Vietnam | 0.12 | Quy Nhon City | QNC | 0.15 |
# Initialize Word document
doc <-
read_docx() |>
body_add_flextable(value = flex_table)
# Define the output path with 'here' library
output_path <- here(
"output",
"fst",
"fst_r2_0.01_SNPS.docx"
)
# Save the Word document
print(doc, target = output_path)
To make a scatter plot
# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
dplyr::group_by(Country) |>
dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))
# save the data
saveRDS(aggregated_data, here(
"output", "fst", "r2_0.01_country.rds"
))
# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]
# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)
# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)
# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)
ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
geom_point(aes(color = Country), size = 3) +
geom_line(aes(y = fitted_values), color = "blue") + # Fitted line
labs(
title = "Mean Fst by Country",
x = "Ordered Countries",
y = "Mean Fst Value"
) +
scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "none")
Estimate distances
# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)
# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)
# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation
# Calculate the distances
for (i in 1:n) {
for (j in 1:n) {
if (i != j) {
coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
}
}
}
# Print the distance matrix
head(distance_matrix)
## HAI YUN HUN OKI KAN UTS KAG
## HAI 0.0000 1035.308 965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081 0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN 965.7432 1107.931 0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283 0.000 1390.3468 1589.7383 617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347 0.0000 288.4938 790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738 288.4938 0.0000 1023.2713
## TAI BEN KUN KAT CAM SUF SUU INW
## HAI 1180.1356 3487.311 4241.329 2648.016 987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN 985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI 873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
## INJ KLP MAT SSK KAC CHA LAM HAN
## HAI 2844.575 1984.417 1861.932 722.0761 1218.270 1086.825 1070.0656 441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303 714.4744 601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029 953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
## HOC QNC
## HAI 990.911 604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
Compare distance and FST
# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]
# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0
# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))
# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))
head(data)
## Distance FST row_index col_index
## 1 0.0000 0.00000000 HAI HAI
## 2 1035.3081 0.08801929 HAI YUN
## 3 965.7432 0.12717041 HAI HUN
## 4 2047.0214 0.19706597 HAI OKI
## 5 3269.9948 0.16437593 HAI KAN
## 6 3527.0034 0.07411078 HAI UTS
Fit linear regression
# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]
# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))
# Plot
ggplot(data, aes(x = Distance, y = FST)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
annotate("text", x = max(data$Distance) * 0.85, y = max(data$FST) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
labs(title = "FST vs Distance - All populations",
x = "Distance (Km)",
y = "FST") +
scale_x_continuous(labels = scales::comma) +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
ggsave(
filename = here("output", "fst", "fst_by_distance_all_samples_LD1.pdf"),
width = 6,
height = 4,
units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
Subset by country. Select countries with at least 3 sampling localities.
countries_with_3_pops <- filtered_sampling_loc %>%
group_by(Country) %>%
filter(n() >= 3) %>%
pull(Country) %>%
unique()
countries_with_3_pops
## [1] "China" "Japan" "Indonesia" "Thailand" "Vietnam"
Run the regression for each country
results <- list()
for (country in countries_with_3_pops) {
# Extract abbreviations for the country
abbreviations <- filtered_sampling_loc %>%
filter(Country == country) %>%
pull(Abbreviation)
# Subset the data
subset_data <- data %>%
filter(row_index %in% abbreviations & col_index %in% abbreviations)
subset_data <- subset_data[subset_data$Distance > 0, ]
# Perform linear regression
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = subset_data)
results[[country]] <- list(
equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
)
}
results
## $China
## $China$equation
## [1] "y = -0.64629x + 4.586"
##
## $China$r2
## [1] "R^2 = 1.00"
##
##
## $Japan
## $Japan$equation
## [1] "y = 0.04826x + -0.142"
##
## $Japan$r2
## [1] "R^2 = 0.10"
##
##
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.05298x + 0.518"
##
## $Indonesia$r2
## [1] "R^2 = 0.15"
##
##
## $Thailand
## $Thailand$equation
## [1] "y = 0.29122x + -1.590"
##
## $Thailand$r2
## [1] "R^2 = 0.40"
##
##
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.05230x + 0.485"
##
## $Vietnam$r2
## [1] "R^2 = 0.16"
Merge the data
data_merged <- data %>%
left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
rename(Country1 = Country) %>%
left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
dplyr::select(-Pop_City.x, -Pop_City.y) %>%
filter(Country1 == Country) # Ensures the data is within the same country
# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")
# Filter
data_filtered <- data_merged %>%
group_by(Country1) %>%
filter(n() >= 3 & Country1 %in% countries_to_include) %>%
ungroup()
Calculate the linear regression for each country
regression_results <- data_filtered %>%
group_by(Country1) %>%
do(model = lm(FST/(1-FST) ~ log(Distance), data = .)) %>%
rowwise() %>%
mutate(equation = sprintf("italic(y) == %.3f * italic(x) + %.3f", coef(model)[2], coef(model)[1]),
r2 = sprintf("italic(R)^2 == %.2f", summary(model)$r.squared))
Plot it
ggplot(data_filtered, aes(x = Distance, y = FST)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~ Country1, scales = "free", ncol = 2) +
geom_text(
data = regression_results,
aes(label = paste(equation, r2, sep = "~~")),
x = Inf, y = Inf,
vjust = 2, hjust = 1.2,
size = 3.5,
parse = TRUE,
inherit.aes = FALSE
) +
labs(title = "FST vs Distance by Country",
x = "Log(Distance)",
y = "FST(1-FST)") +
scale_x_continuous(labels = scales::comma) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
ggsave(
filename = here("output", "fst", "fst_by_distance_countries_LD1.pdf"),
width = 6,
height = 8,
units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
We can merge the FST and distance matrices
# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)
# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]
# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names
# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]
# Format the matrix (Fst two decimals and distance in Km with zero decimals)
# Format the elements based on their position in the matrix
for(i in 1:nrow(merged_matrix)) {
for(j in 1:ncol(merged_matrix)) {
if (i < j) {
# Upper triangular - Fst values with two decimal places
merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
} else if (i > j) {
# Lower triangular - Distance values with zero decimal places
merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
}
}
}
# Now the merged_matrix should be formatted as you need
print(merged_matrix)
## BEN CAM CHA HAI HAN HOC HUN INJ INW KAC
## BEN NA "0.03" "0.03" "0.07" "0.07" "0.08" "0.10" "0.13" "0.16" "0.02"
## CAM "2976" NA "0.00" "0.04" "0.04" "0.05" "0.07" "0.12" "0.15" "0.00"
## CHA "2660" "329" NA "0.05" "0.05" "0.06" "0.08" "0.12" "0.15" "0.00"
## HAI "3487" "987" "1087" NA "0.04" "0.03" "0.04" "0.15" "0.19" "0.05"
## HAN "3134" "1059" "1019" "441" NA "0.04" "0.05" "0.16" "0.21" "0.05"
## HOC "3174" "209" "537" "991" "1146" NA "0.04" "0.16" "0.20" "0.06"
## HUN "3913" "1929" "1956" "966" "954" "1954" NA "0.16" "0.20" "0.08"
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA "0.06" "0.13"
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA "0.16"
## KAC "2377" "647" "320" "1218" "1028" "856" "1981" "2393" "3492" NA
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713"
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456"
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119"
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234"
## KUN "957" "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052"
## LAM "2419" "951" "692" "1070" "733" "1139" "1637" "2844" "3859" "475"
## MAT "2751" "875" "893" "1862" "1894" "912" "2799" "1363" "2650" "1060"
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265"
## QNC "3424" "528" "782" "604" "882" "436" "1565" "2241" "2880" "1047"
## SSK "2894" "402" "368" "722" "678" "547" "1595" "2390" "3271" "530"
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859" "2874"
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973" "3147"
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986" "3562" "3635" "2397"
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722"
## YUN "2806" "1487" "1323" "1035" "601" "1627" "1108" "3467" "4320" "1179"
## KAG KAN KAT KLP KUN LAM MAT OKI QNC SSK
## BEN "0.16" "0.20" "0.10" "0.08" "0.13" "0.03" "0.04" "0.15" "0.10" "0.03"
## CAM "0.13" "0.17" "0.10" "0.05" "0.15" "0.01" "0.02" "0.11" "0.08" "0.01"
## CHA "0.14" "0.18" "0.10" "0.05" "0.15" "0.01" "0.02" "0.13" "0.08" "0.00"
## HAI "0.10" "0.14" "0.13" "0.09" "0.19" "0.06" "0.06" "0.09" "0.12" "0.06"
## HAN "0.12" "0.17" "0.13" "0.09" "0.22" "0.05" "0.06" "0.11" "0.13" "0.05"
## HOC "0.11" "0.15" "0.14" "0.10" "0.22" "0.07" "0.07" "0.10" "0.14" "0.06"
## HUN "0.07" "0.11" "0.15" "0.11" "0.21" "0.08" "0.09" "0.07" "0.14" "0.08"
## INJ "0.22" "0.26" "0.09" "0.17" "0.26" "0.13" "0.13" "0.21" "0.19" "0.12"
## INW "0.26" "0.32" "0.10" "0.23" "0.33" "0.16" "0.16" "0.26" "0.23" "0.15"
## KAC "0.14" "0.19" "0.10" "0.06" "0.17" "0.01" "0.02" "0.13" "0.08" "0.00"
## KAG NA "0.12" "0.21" "0.18" "0.28" "0.15" "0.15" "0.12" "0.20" "0.14"
## KAN "791" NA "0.25" "0.23" "0.33" "0.19" "0.19" "0.16" "0.25" "0.18"
## KAT "4367" "4880" NA "0.15" "0.23" "0.10" "0.11" "0.20" "0.17" "0.10"
## KLP "4376" "5163" "3242" NA "0.24" "0.06" "0.06" "0.17" "0.14" "0.06"
## KUN "6631" "7300" "2779" "3196" NA "0.17" "0.17" "0.27" "0.24" "0.16"
## LAM "3450" "4160" "1789" "1704" "3202" NA "0.02" "0.13" "0.09" "0.01"
## MAT "4289" "5073" "3069" "177" "3124" "1531" NA "0.14" "0.10" "0.02"
## OKI "618" "1390" "4204" "3822" "6262" "3060" "3747" NA "0.19" "0.13"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA "0.08"
## SSK "3232" "3991" "2412" "1365" "3582" "624" "1217" "2758" "548" NA
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873" "1548" "1886"
## UTS "1023" "288" "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714" "2208" "2678" "1449" "1087"
## SUF SUU TAI UTS YUN
## BEN "0.14" "0.11" "0.20" "0.17" "0.02"
## CAM "0.13" "0.09" "0.17" "0.14" "0.00"
## CHA "0.13" "0.09" "0.18" "0.16" "0.00"
## HAI "0.15" "0.09" "0.14" "0.11" "0.05"
## HAN "0.17" "0.10" "0.18" "0.14" "0.05"
## HOC "0.16" "0.10" "0.15" "0.12" "0.06"
## HUN "0.16" "0.10" "0.13" "0.09" "0.08"
## INJ "0.08" "0.09" "0.27" "0.24" "0.12"
## INW "0.09" "0.12" "0.34" "0.28" "0.15"
## KAC "0.14" "0.10" "0.20" "0.17" "-0.00"
## KAG "0.23" "0.16" "0.18" "0.09" "0.14"
## KAN "0.27" "0.21" "0.22" "0.10" "0.18"
## KAT "0.06" "0.07" "0.26" "0.22" "0.10"
## KLP "0.19" "0.14" "0.24" "0.20" "0.06"
## KUN "0.29" "0.25" "0.35" "0.30" "0.16"
## LAM "0.14" "0.10" "0.19" "0.17" "0.01"
## MAT "0.14" "0.10" "0.19" "0.17" "0.02"
## OKI "0.22" "0.15" "0.16" "0.13" "0.13"
## QNC "0.21" "0.17" "0.25" "0.22" "0.08"
## SSK "0.13" "0.10" "0.18" "0.16" "0.00"
## SUF NA "0.04" "0.28" "0.24" "0.13"
## SUU "381" NA "0.22" "0.18" "0.10"
## TAI "2777" "2747" NA "0.20" "0.18"
## UTS "4755" "4566" "2418" NA "0.16"
## YUN "3566" "3744" "1928" "3902" NA
For reference, the original sampling locations (before Region2 was added):
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
We can sort by distance
# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)
# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))
# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]
# Your existing code to initialize and fill the merged_matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]
# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
for(j in 1:ncol(merged_matrix)) {
if (i < j) {
merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
} else if (i > j) {
merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
}
}
}
# Print the merged matrix
print(merged_matrix)
## SSK QNC CAM HOC CHA HAI HAN KAC LAM YUN
## SSK NA "0.08" "0.01" "0.06" "0.00" "0.06" "0.05" "0.00" "0.01" "0.00"
## QNC "548" NA "0.08" "0.14" "0.08" "0.12" "0.13" "0.08" "0.09" "0.08"
## CAM "402" "528" NA "0.05" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## HOC "547" "436" "209" NA "0.06" "0.03" "0.04" "0.06" "0.07" "0.06"
## CHA "368" "782" "329" "537" NA "0.05" "0.05" "0.00" "0.01" "0.00"
## HAI "722" "604" "987" "991" "1087" NA "0.04" "0.05" "0.06" "0.05"
## HAN "678" "882" "1059" "1146" "1019" "441" NA "0.05" "0.05" "0.05"
## KAC "530" "1047" "647" "856" "320" "1218" "1028" NA "0.01" "0.00"
## LAM "624" "1155" "951" "1139" "692" "1070" "733" "475" NA "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601" "1179" "714" NA
## MAT "1217" "1348" "875" "912" "893" "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966" "954" "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
## MAT HUN KLP TAI OKI SUF INJ SUU KAT KAG
## SSK "0.02" "0.08" "0.06" "0.18" "0.13" "0.13" "0.12" "0.10" "0.10" "0.14"
## QNC "0.10" "0.14" "0.14" "0.25" "0.19" "0.21" "0.19" "0.17" "0.17" "0.20"
## CAM "0.02" "0.07" "0.05" "0.17" "0.11" "0.13" "0.12" "0.09" "0.10" "0.13"
## HOC "0.07" "0.04" "0.10" "0.15" "0.10" "0.16" "0.16" "0.10" "0.14" "0.11"
## CHA "0.02" "0.08" "0.05" "0.18" "0.13" "0.13" "0.12" "0.09" "0.10" "0.14"
## HAI "0.06" "0.04" "0.09" "0.14" "0.09" "0.15" "0.15" "0.09" "0.13" "0.10"
## HAN "0.06" "0.05" "0.09" "0.18" "0.11" "0.17" "0.16" "0.10" "0.13" "0.12"
## KAC "0.02" "0.08" "0.06" "0.20" "0.13" "0.14" "0.13" "0.10" "0.10" "0.14"
## LAM "0.02" "0.08" "0.06" "0.19" "0.13" "0.14" "0.13" "0.10" "0.10" "0.15"
## YUN "0.02" "0.08" "0.06" "0.18" "0.13" "0.13" "0.12" "0.10" "0.10" "0.14"
## MAT NA "0.09" "0.06" "0.19" "0.14" "0.14" "0.13" "0.10" "0.11" "0.15"
## HUN "2799" NA "0.11" "0.13" "0.07" "0.16" "0.16" "0.10" "0.15" "0.07"
## KLP "177" "2932" NA "0.24" "0.17" "0.19" "0.17" "0.14" "0.15" "0.18"
## TAI "2894" "986" "2979" NA "0.16" "0.28" "0.27" "0.22" "0.26" "0.18"
## OKI "3747" "1599" "3822" "873" NA "0.22" "0.21" "0.15" "0.20" "0.12"
## SUF "2220" "3406" "2115" "2777" "3280" NA "0.08" "0.04" "0.06" "0.23"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA "0.09" "0.09" "0.22"
## SUU "2565" "3468" "2469" "2747" "3154" "381" "1911" NA "0.07" "0.16"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA "0.21"
## KAG "4289" "1860" "4376" "1397" "618" "3893" "4901" "3756" "4367" NA
## INW "2650" "4248" "2505" "3635" "4110" "859" "1528" "973" "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791"
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
## INW BEN KAN UTS KUN
## SSK "0.15" "0.03" "0.18" "0.16" "0.16"
## QNC "0.23" "0.10" "0.25" "0.22" "0.24"
## CAM "0.15" "0.03" "0.17" "0.14" "0.15"
## HOC "0.20" "0.08" "0.15" "0.12" "0.22"
## CHA "0.15" "0.03" "0.18" "0.16" "0.15"
## HAI "0.19" "0.07" "0.14" "0.11" "0.19"
## HAN "0.21" "0.07" "0.17" "0.14" "0.22"
## KAC "0.16" "0.02" "0.19" "0.17" "0.17"
## LAM "0.16" "0.03" "0.19" "0.17" "0.17"
## YUN "0.15" "0.02" "0.18" "0.16" "0.16"
## MAT "0.16" "0.04" "0.19" "0.17" "0.17"
## HUN "0.20" "0.10" "0.11" "0.09" "0.21"
## KLP "0.23" "0.08" "0.23" "0.20" "0.24"
## TAI "0.34" "0.20" "0.22" "0.20" "0.35"
## OKI "0.26" "0.15" "0.16" "0.13" "0.27"
## SUF "0.09" "0.14" "0.27" "0.24" "0.29"
## INJ "0.06" "0.13" "0.26" "0.24" "0.26"
## SUU "0.12" "0.11" "0.21" "0.18" "0.25"
## KAT "0.10" "0.10" "0.25" "0.22" "0.23"
## KAG "0.26" "0.16" "0.12" "0.09" "0.28"
## INW NA "0.16" "0.32" "0.28" "0.33"
## BEN "5347" NA "0.20" "0.17" "0.13"
## KAN "5424" "6410" NA "0.10" "0.33"
## UTS "5539" "6696" "288" NA "0.30"
## KUN "5513" "957" "7300" "7583" NA
Make a table and save it as a Word document (the export step is sketched after the table)
# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)
# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]
# Create a flextable object from the merged_matrix
ft <- qflextable(as.data.frame(merged_df))
ft
Population | SSK | QNC | CAM | HOC | CHA | HAI | HAN | KAC | LAM | YUN | MAT | HUN | KLP | TAI | OKI | SUF | INJ | SUU | KAT | KAG | INW | BEN | KAN | UTS | KUN |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SSK | 0.08 | 0.01 | 0.06 | 0.00 | 0.06 | 0.05 | 0.00 | 0.01 | 0.00 | 0.02 | 0.08 | 0.06 | 0.18 | 0.13 | 0.13 | 0.12 | 0.10 | 0.10 | 0.14 | 0.15 | 0.03 | 0.18 | 0.16 | 0.16 | |
QNC | 548 | 0.08 | 0.14 | 0.08 | 0.12 | 0.13 | 0.08 | 0.09 | 0.08 | 0.10 | 0.14 | 0.14 | 0.25 | 0.19 | 0.21 | 0.19 | 0.17 | 0.17 | 0.20 | 0.23 | 0.10 | 0.25 | 0.22 | 0.24 | |
CAM | 402 | 528 | 0.05 | 0.00 | 0.04 | 0.04 | 0.00 | 0.01 | 0.00 | 0.02 | 0.07 | 0.05 | 0.17 | 0.11 | 0.13 | 0.12 | 0.09 | 0.10 | 0.13 | 0.15 | 0.03 | 0.17 | 0.14 | 0.15 | |
HOC | 547 | 436 | 209 | 0.06 | 0.03 | 0.04 | 0.06 | 0.07 | 0.06 | 0.07 | 0.04 | 0.10 | 0.15 | 0.10 | 0.16 | 0.16 | 0.10 | 0.14 | 0.11 | 0.20 | 0.08 | 0.15 | 0.12 | 0.22 | |
CHA | 368 | 782 | 329 | 537 | 0.05 | 0.05 | 0.00 | 0.01 | 0.00 | 0.02 | 0.08 | 0.05 | 0.18 | 0.13 | 0.13 | 0.12 | 0.09 | 0.10 | 0.14 | 0.15 | 0.03 | 0.18 | 0.16 | 0.15 | |
HAI | 722 | 604 | 987 | 991 | 1087 | 0.04 | 0.05 | 0.06 | 0.05 | 0.06 | 0.04 | 0.09 | 0.14 | 0.09 | 0.15 | 0.15 | 0.09 | 0.13 | 0.10 | 0.19 | 0.07 | 0.14 | 0.11 | 0.19 | |
HAN | 678 | 882 | 1059 | 1146 | 1019 | 441 | 0.05 | 0.05 | 0.05 | 0.06 | 0.05 | 0.09 | 0.18 | 0.11 | 0.17 | 0.16 | 0.10 | 0.13 | 0.12 | 0.21 | 0.07 | 0.17 | 0.14 | 0.22 | |
KAC | 530 | 1047 | 647 | 856 | 320 | 1218 | 1028 | 0.01 | 0.00 | 0.02 | 0.08 | 0.06 | 0.20 | 0.13 | 0.14 | 0.13 | 0.10 | 0.10 | 0.14 | 0.16 | 0.02 | 0.19 | 0.17 | 0.17 | |
LAM | 624 | 1155 | 951 | 1139 | 692 | 1070 | 733 | 475 | 0.01 | 0.02 | 0.08 | 0.06 | 0.19 | 0.13 | 0.14 | 0.13 | 0.10 | 0.10 | 0.15 | 0.16 | 0.03 | 0.19 | 0.17 | 0.17 | |
YUN | 1087 | 1449 | 1487 | 1627 | 1323 | 1035 | 601 | 1179 | 714 | 0.02 | 0.08 | 0.06 | 0.18 | 0.13 | 0.13 | 0.12 | 0.10 | 0.10 | 0.14 | 0.15 | 0.02 | 0.18 | 0.16 | 0.16 | |
MAT | 1217 | 1348 | 875 | 912 | 893 | 1862 | 1894 | 1060 | 1531 | 2208 | 0.09 | 0.06 | 0.19 | 0.14 | 0.14 | 0.13 | 0.10 | 0.11 | 0.15 | 0.16 | 0.04 | 0.19 | 0.17 | 0.17 | |
HUN | 1595 | 1565 | 1929 | 1954 | 1956 | 966 | 954 | 1981 | 1637 | 1108 | 2799 | 0.11 | 0.13 | 0.07 | 0.16 | 0.16 | 0.10 | 0.15 | 0.07 | 0.20 | 0.10 | 0.11 | 0.09 | 0.21 | |
KLP | 1365 | 1445 | 1003 | 1011 | 1055 | 1984 | 2042 | 1234 | 1704 | 2375 | 177 | 2932 | 0.24 | 0.17 | 0.19 | 0.17 | 0.14 | 0.15 | 0.18 | 0.23 | 0.08 | 0.23 | 0.20 | 0.24 | |
TAI | 1886 | 1548 | 2061 | 1982 | 2237 | 1180 | 1497 | 2397 | 2219 | 1928 | 2894 | 986 | 2979 | 0.16 | 0.28 | 0.27 | 0.22 | 0.26 | 0.18 | 0.34 | 0.20 | 0.22 | 0.20 | 0.35 | |
OKI | 2758 | 2410 | 2929 | 2839 | 3110 | 2047 | 2328 | 3265 | 3060 | 2678 | 3747 | 1599 | 3822 | 873 | 0.22 | 0.21 | 0.15 | 0.20 | 0.12 | 0.26 | 0.15 | 0.16 | 0.13 | 0.27 | |
SUF | 2567 | 2117 | 2245 | 2046 | 2557 | 2614 | 2986 | 2874 | 3182 | 3566 | 2220 | 3406 | 2115 | 2777 | 3280 | 0.08 | 0.04 | 0.06 | 0.23 | 0.09 | 0.14 | 0.27 | 0.24 | 0.29 | |
INJ | 2390 | 2241 | 1989 | 1889 | 2160 | 2845 | 3034 | 2393 | 2844 | 3467 | 1363 | 3805 | 1188 | 3562 | 4294 | 1535 | 0.09 | 0.09 | 0.22 | 0.06 | 0.13 | 0.26 | 0.24 | 0.26 | |
SUU | 2797 | 2309 | 2506 | 2300 | 2827 | 2752 | 3151 | 3147 | 3420 | 3744 | 2565 | 3468 | 2469 | 2747 | 3154 | 381 | 1911 | 0.07 | 0.16 | 0.12 | 0.11 | 0.21 | 0.18 | 0.25 | |
KAT | 2412 | 2922 | 2724 | 2921 | 2424 | 2648 | 2207 | 2119 | 1789 | 1640 | 3069 | 2611 | 3242 | 3537 | 4204 | 4967 | 4429 | 5209 | 0.21 | 0.10 | 0.10 | 0.25 | 0.22 | 0.23 | |
KAG | 3232 | 2942 | 3446 | 3377 | 3595 | 2510 | 2721 | 3713 | 3450 | 2967 | 4289 | 1860 | 4376 | 1397 | 618 | 3893 | 4901 | 3756 | 4367 | 0.26 | 0.16 | 0.12 | 0.09 | 0.28 | |
INW | 3271 | 2880 | 2908 | 2727 | 3190 | 3419 | 3762 | 3492 | 3859 | 4320 | 2650 | 4248 | 2505 | 3635 | 4110 | 859 | 1528 | 973 | 5611 | 4718 | 0.16 | 0.32 | 0.28 | 0.33 | |
BEN | 2894 | 3424 | 2976 | 3174 | 2660 | 3487 | 3134 | 2377 | 2419 | 2806 | 2751 | 3913 | 2869 | 4631 | 5450 | 4970 | 3876 | 5307 | 1828 | 5773 | 5347 | 0.20 | 0.17 | 0.13 | |
KAN | 3991 | 3725 | 4223 | 4161 | 4357 | 3270 | 3444 | 4456 | 4160 | 3619 | 5073 | 2532 | 5163 | 2185 | 1390 | 4621 | 5685 | 4452 | 4880 | 791 | 5424 | 6410 | 0.10 | 0.33 | |
UTS | 4249 | 3965 | 4470 | 4399 | 4613 | 3527 | 3715 | 4722 | 4435 | 3902 | 5311 | 2811 | 5396 | 2418 | 1590 | 4755 | 5874 | 4566 | 5167 | 1023 | 5539 | 6696 | 288 | 0.30 | |
KUN | 3582 | 4070 | 3572 | 3749 | 3288 | 4241 | 3934 | 3052 | 3202 | 3681 | 3124 | 4777 | 3196 | 5414 | 6262 | 5293 | 3986 | 5658 | 2779 | 6631 | 5513 | 957 | 7300 | 7583 |
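A sketch of the Word export, using the same officer workflow as earlier (the file name is an assumption):
# Initialize Word document and add the distance/Fst table
doc <- read_docx() |>
body_add_flextable(value = ft)
# Save the Word document (hypothetical file name)
print(doc, target = here("output", "fst", "fst_distance_matrix_r2_0.01.docx"))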
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6244094 333.5 11233748 600.0 NA 10670513 569.9
## Vcells 10548729 80.5 18029388 137.6 32768 14957815 114.2
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.1 \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/r2_0.1 \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.1.log
## 58152 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.972163.
## 58152 variants and 230 people pass filters and QC.
Look at https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of Fst estimations
r2_0.1 <-
read.PLINK(
here(
"output", "fst", "r2_0.1.raw"
),
quiet = FALSE,
chunkSize = 1000,
parallel = require("parallel"),
n.cores = 4
)
summary(r2_0.1)
This chunk will take a couple minutes to run.
# convert
r2_0.1_2 <- stamppConvert(r2_0.1, type="genlight")
# run stampp. If you want to run it with bootstraps and nclusters, use the HPC. It will run out of memory on a 32 Gb laptop
r2_0.1_3 <- stamppFst(r2_0.1_2, 1, 95, 1)
Save it
To load it
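As before, a minimal save/load sketch (assumed file name):
saveRDS(r2_0.1_3, here("output", "fst", "r2_0.1_fst.rds"))
r2_0.1_3 <- readRDS(here("output", "fst", "r2_0.1_fst.rds"))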
Now let's look at the object
## OKI HAI YUN KAT
## Min. :0.07442 Min. :0.03415 Min. :-0.000061 Min. :0.06197
## 1st Qu.:0.11875 1st Qu.:0.04948 1st Qu.: 0.020653 1st Qu.:0.10390
## Median :0.13231 Median :0.08208 Median : 0.074156 Median :0.11475
## Mean :0.14973 Mean :0.09324 Mean : 0.081748 Mean :0.14294
## 3rd Qu.:0.17241 3rd Qu.:0.13763 3rd Qu.: 0.138696 3rd Qu.:0.17543
## Max. :0.26203 Max. :0.19423 Max. : 0.184499 Max. :0.26064
## NA's :1 NA's :2 NA's :3 NA's :4
## TAI HUN HAN KLP
## Min. :0.1335 Min. :0.04272 Min. :0.03747 Min. :0.05153
## 1st Qu.:0.1827 1st Qu.:0.07029 1st Qu.:0.04754 1st Qu.:0.05847
## Median :0.1963 Median :0.08814 Median :0.09929 Median :0.13852
## Mean :0.2179 Mean :0.10291 Mean :0.10694 Mean :0.13005
## 3rd Qu.:0.2400 3rd Qu.:0.12577 3rd Qu.:0.15623 3rd Qu.:0.18669
## Max. :0.3444 Max. :0.20465 Max. :0.22103 Max. :0.24103
## NA's :5 NA's :6 NA's :7 NA's :8
## HOC QNC SSK KAC
## Min. :0.04642 Min. :0.07948 Min. :0.002698 Min. :0.001375
## 1st Qu.:0.06326 1st Qu.:0.08570 1st Qu.:0.011550 1st Qu.:0.018868
## Median :0.10400 Median :0.17367 Median :0.113973 Median :0.132199
## Mean :0.11322 Mean :0.15507 Mean :0.087070 Mean :0.095838
## 3rd Qu.:0.15567 3rd Qu.:0.21453 3rd Qu.:0.147997 3rd Qu.:0.153453
## Max. :0.21166 Max. :0.24452 Max. :0.179032 Max. :0.184788
## NA's :9 NA's :10 NA's :11 NA's :12
## CHA KAN UTS CAM
## Min. :0.004038 Min. :0.1031 Min. :0.08997 Min. :0.008172
## 1st Qu.:0.023675 1st Qu.:0.1759 1st Qu.:0.15738 1st Qu.:0.025363
## Median :0.128982 Median :0.1946 Median :0.17412 Median :0.121331
## Mean :0.099625 Mean :0.2137 Mean :0.19413 Mean :0.094094
## 3rd Qu.:0.149675 3rd Qu.:0.2701 3rd Qu.:0.24146 3rd Qu.:0.137461
## Max. :0.177767 Max. :0.3177 Max. :0.28404 Max. :0.161974
## NA's :13 NA's :14 NA's :15 NA's :16
## KAG BEN LAM SUF
## Min. :0.1362 Min. :0.02818 Min. :0.01876 Min. :0.04022
## 1st Qu.:0.1471 1st Qu.:0.07735 1st Qu.:0.10946 1st Qu.:0.07931
## Median :0.1938 Median :0.13012 Median :0.13912 Median :0.09606
## Mean :0.1963 Mean :0.11092 Mean :0.12277 Mean :0.13239
## 3rd Qu.:0.2366 3rd Qu.:0.14495 3rd Qu.:0.16100 3rd Qu.:0.15057
## Max. :0.2646 Max. :0.17353 Max. :0.17178 Max. :0.29578
## NA's :17 NA's :18 NA's :19 NA's :20
## SUU KUN INW INJ
## Min. :0.08678 Min. :0.1681 Min. :0.06515 Min. :0.1366
## 1st Qu.:0.10211 1st Qu.:0.2155 1st Qu.:0.09282 1st Qu.:0.1366
## Median :0.11800 Median :0.2629 Median :0.12049 Median :0.1366
## Mean :0.14286 Mean :0.2578 Mean :0.12049 Mean :0.1366
## 3rd Qu.:0.15875 3rd Qu.:0.3027 3rd Qu.:0.14816 3rd Qu.:0.1366
## Max. :0.24867 Max. :0.3425 Max. :0.17582 Max. :0.1366
## NA's :21 NA's :22 NA's :23 NA's :24
## MAT
## Min. : NA
## 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :25
If you want, you can save the Fst values as a CSV.
# Convert to data frame
r2_0.1_df <- data.frame(r2_0.1_3)
# Save it
write.csv(r2_0.1_df, file = here("output", "fst", "r2_0.1_df.csv"))
Check the Fst values
## OKI HAI YUN KAT TAI HUN HAN KLP HOC QNC
## OKI NA NA NA NA NA NA NA NA NA NA
## HAI 0.08802883 NA NA NA NA NA NA NA NA NA
## YUN 0.11878133 0.04941474 NA NA NA NA NA NA NA NA
## KAT 0.19738113 0.13464192 0.10181861 NA NA NA NA NA NA NA
## TAI 0.16804359 0.14154214 0.18449852 0.2606445 NA NA NA NA NA NA
## HUN 0.07441617 0.03534541 0.06802916 0.1476221 0.1334822 NA NA NA NA NA
## SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## HAI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## YUN NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## KAT NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## TAI NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## HUN NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Now let's get the Fst values from the stamppFst output. When run with bootstraps (e.g., on the HPC), the object also holds the bootstraps, CI limits, and p-values; here we convert the Fst values into a matrix.
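As before, a sketch of the folded conversion:
aa <- as.matrix(r2_0.1_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(aa)]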
## OKI HAI YUN KAT TAI HUN HAN
## OKI NA 0.08802883 0.11878133 0.1973811 0.1680436 0.07441617 0.10763474
## HAI 0.08802883 NA 0.04941474 0.1346419 0.1415421 0.03534541 0.03825566
## YUN 0.11878133 0.04941474 NA 0.1018186 0.1844985 0.06802916 0.04338869
## KAT 0.19738113 0.13464192 0.10181861 NA 0.2606445 0.14762208 0.13924461
## TAI 0.16804359 0.14154214 0.18449852 0.2606445 NA 0.13348223 0.18073656
## HUN 0.07441617 0.03534541 0.06802916 0.1476221 0.1334822 NA 0.04712316
## KLP HOC QNC SSK KAC CHA
## OKI 0.15791274 0.09583416 0.1855034 0.120337901 1.222639e-01 0.11864070
## HAI 0.08207801 0.03414611 0.1192940 0.051748990 4.855596e-02 0.04954779
## YUN 0.05497547 0.05872326 0.0802834 0.003470757 -6.095245e-05 0.00216753
## KAT 0.14992931 0.14537279 0.1754255 0.103904742 1.043844e-01 0.10291812
## TAI 0.23665704 0.15932868 0.2500647 0.183363174 1.938385e-01 0.18037855
## HUN 0.09955817 0.04272361 0.1355892 0.070972254 6.854084e-02 0.06961833
## KAN UTS CAM KAG BEN LAM
## OKI 0.1661065 0.13562198 0.109049041 0.12082719 0.13838680 0.123949616
## HAI 0.1406218 0.11163440 0.041994919 0.09877636 0.06924561 0.053818795
## YUN 0.1797822 0.14989155 0.003581621 0.13178798 0.02349872 0.006781344
## KAT 0.2507648 0.22214457 0.101256096 0.20504141 0.10811511 0.106234969
## TAI 0.2282426 0.19867600 0.171069919 0.18782724 0.20034447 0.189514500
## HUN 0.1159453 0.08822364 0.059878361 0.07511422 0.08813833 0.073799872
## SUF SUU KUN INW INJ MAT
## OKI 0.22063500 0.15757698 0.2620293 0.2615279 0.21402287 0.12899644
## HAI 0.15560449 0.09438831 0.1867310 0.1942290 0.15201957 0.06079123
## YUN 0.14099822 0.09767201 0.1564464 0.1629609 0.12806009 0.01970472
## KAT 0.06196652 0.07006897 0.2304904 0.1073455 0.09416537 0.11474970
## TAI 0.29024752 0.22479493 0.3443908 0.3418771 0.27387778 0.18974486
## HUN 0.16664122 0.10268620 0.2017426 0.2046510 0.16392725 0.08049047
Import sample locations
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Arrange by region
sampling_loc <- sampling_loc |>
dplyr::arrange(
Region2, Country
)
# Check it
head(sampling_loc)
## # A tibble: 6 × 7
## Pop_City Country Latitude Longitude Region Abbreviation Region2
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 Hainan China 19.2 110. Asia HAI East Asia
## 2 Yunnan China 24.5 101. Asia YUN East Asia
## 3 Hunan China 27.6 112. Asia HUN East Asia
## 4 Okinawa Japan 26.5 128. Asia OKI East Asia
## 5 Sendai Japan 38.3 141. Asia SEN East Asia
## 6 Nagasaki Japan 32.8 130. Asia NAG East Asia
Order
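As above, presumably:
order_pops <- sampling_loc$Abbreviation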
## [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"
Create vector with order of populations
# Extract the populations that appear in the Fst data frame (r2_0.1_df)
populations_in_r2_0.1 <- colnames(r2_0.1_df)
# Keep only the populations that are in order_pops (the matrix order is preserved)
poporder <- populations_in_r2_0.1[populations_in_r2_0.1 %in% order_pops]
# Print the reordered populations
print(poporder)
## [1] "OKI" "HAI" "YUN" "KAT" "TAI" "HUN" "HAN" "KLP" "HOC" "QNC" "SSK" "KAC"
## [13] "CHA" "KAN" "UTS" "CAM" "KAG" "BEN" "LAM" "SUF" "SUU" "KUN" "INW" "INJ"
## [25] "MAT"
Let's check if the matrix is symmetric.
## [1] TRUE
Now let's order the matrix using poporder. We will also fill the redundant triangle and the diagonal with NA so each pairwise value appears only once in the heatmap.
Now we have to convert the matrix to a data frame to plot it with ggplot.
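As in the r2 = 0.01 section, a sketch of the folded reorder-and-melt step:
aa <- aa[poporder, poporder]
aa[lower.tri(aa, diag = TRUE)] <- NA
pairfst.long <- melt(aa)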
## Var1 Var2 value
## OKI : 25 OKI : 25 Min. :-0.0001
## HAI : 25 HAI : 25 1st Qu.: 0.0749
## YUN : 25 YUN : 25 Median : 0.1320
## KAT : 25 KAT : 25 Mean : 0.1313
## TAI : 25 TAI : 25 3rd Qu.: 0.1763
## HUN : 25 HUN : 25 Max. : 0.3444
## (Other):475 (Other):475 NA's :325
Now let's plot the data with ggplot. You can click the small square at the top left of the plot pane to open it in a new window with the correct proportions.
pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient(
low = "white",
high = "#71b6ff",
name = "Fst",
na.value = "white",
limits = c(0, 0.5)
) +
scale_x_discrete(position = "top") +
theme_bw() +
geom_text(aes(label = ifelse(
is.na(value), "", formatC(value, digits = 2, format = "f")
)), size = 3) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
axis.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(hjust = 0)
)
pairfst.f
ggsave(
filename = here("output", "fst", "fst_matrix_r2_0.1.pdf"),
pairfst.f,
width = 10,
height = 10,
units = "in"
)
By country
# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)
# Step 2: Calculate mean Fst for each pair of countries
# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(r2_0.1_df))
fst_df$Abbreviation1 <- rownames(fst_df)
# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)
# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))
# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>%
group_by(Country_1, Country_2) %>%
summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>%
filter(Country_1 != Country_2)
# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)
# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)
for(i in 1:nrow(fst_matrix_summary)) {
for(j in i:nrow(fst_matrix_summary)) {
if (i == j) {
symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
} else {
avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
symmetric_fst_matrix[i, j] <- avg_value
symmetric_fst_matrix[j, i] <- avg_value
}
}
}
# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))
# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
## Cambodia China India Indonesia Japan Malaysia
## Cambodia NA 0.03515163 0.02536262 0.12941643 0.1297409 0.03646673
## China 0.035151634 NA 0.06029422 0.14698652 0.1075254 0.06626635
## India 0.025362622 0.06029422 NA 0.14413628 0.1619263 0.05902305
## Indonesia 0.129416433 0.14698652 0.14413628 NA 0.2343967 0.16434783
## Japan 0.129740854 0.10752536 0.16192628 0.23439668 NA 0.17315795
## Malaysia 0.036466732 0.06626635 0.05902305 0.16434783 0.1731579 NA
## Maldives 0.152913626 0.18164001 0.13012441 0.28745228 0.2816610 0.20454714
## Nepal 0.101256096 0.13292617 0.10811511 0.08338658 0.2116824 0.13233951
## Taiwan 0.171069919 0.14825128 0.20034447 0.28269932 0.1864794 0.21320095
## Thailand 0.006041354 0.04158013 0.02653178 0.13544451 0.1460455 0.03823662
## Vietnam 0.054734247 0.06661412 0.08278023 0.17914574 0.1451190 0.09840066
## Maldives Nepal Taiwan Thailand Vietnam
## Cambodia 0.1529136 0.10125610 0.1710699 0.006041354 0.05473425
## China 0.1816400 0.13292617 0.1482513 0.041580126 0.06661412
## India 0.1301244 0.10811511 0.2003445 0.026531776 0.08278023
## Indonesia 0.2874523 0.08338658 0.2826993 0.135444507 0.17914574
## Japan 0.2816610 0.21168235 0.1864794 0.146045512 0.14511899
## Malaysia 0.2045471 0.13233951 0.2132009 0.038236622 0.09840066
## Maldives NA 0.23049036 0.3443908 0.161208330 0.22357228
## Nepal 0.2304904 NA 0.2606445 0.104360558 0.15334765
## Taiwan 0.3443908 0.26064445 NA 0.186773674 0.19670999
## Thailand 0.1612083 0.10436056 0.1867737 NA 0.06323481
## Vietnam 0.2235723 0.15334765 0.1967100 0.063234806 NA
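The second print below shows the same matrix with the lower triangle and diagonal masked, so each country pair appears only once. That chunk is hidden; a sketch (upper_fst is a hypothetical name):
upper_fst <- symmetric_fst_matrix
upper_fst[lower.tri(upper_fst, diag = TRUE)] <- NA
print(upper_fst)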
## Cambodia China India Indonesia Japan Malaysia
## Cambodia NA 0.03515163 0.02536262 0.1294164 0.1297409 0.03646673
## China NA NA 0.06029422 0.1469865 0.1075254 0.06626635
## India NA NA NA 0.1441363 0.1619263 0.05902305
## Indonesia NA NA NA NA 0.2343967 0.16434783
## Japan NA NA NA NA NA 0.17315795
## Malaysia NA NA NA NA NA NA
## Maldives NA NA NA NA NA NA
## Nepal NA NA NA NA NA NA
## Taiwan NA NA NA NA NA NA
## Thailand NA NA NA NA NA NA
## Vietnam NA NA NA NA NA NA
## Maldives Nepal Taiwan Thailand Vietnam
## Cambodia 0.1529136 0.10125610 0.1710699 0.006041354 0.05473425
## China 0.1816400 0.13292617 0.1482513 0.041580126 0.06661412
## India 0.1301244 0.10811511 0.2003445 0.026531776 0.08278023
## Indonesia 0.2874523 0.08338658 0.2826993 0.135444507 0.17914574
## Japan 0.2816610 0.21168235 0.1864794 0.146045512 0.14511899
## Malaysia 0.2045471 0.13233951 0.2132009 0.038236622 0.09840066
## Maldives NA 0.23049036 0.3443908 0.161208330 0.22357228
## Nepal NA NA 0.2606445 0.104360558 0.15334765
## Taiwan NA NA NA 0.186773674 0.19670999
## Thailand NA NA NA NA 0.06323481
## Vietnam NA NA NA NA NA
Now we have to convert the matrix to a data frame to plot it with ggplot.
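Presumably again with melt (upper_fst from the sketch above):
pairfst.long2 <- melt(upper_fst)
summary(pairfst.long2)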
## Var1 Var2 value
## Cambodia :11 Cambodia :11 Min. :0.00604
## China :11 China :11 1st Qu.:0.08308
## India :11 India :11 Median :0.14512
## Indonesia:11 Indonesia:11 Mean :0.14236
## Japan :11 Japan :11 3rd Qu.:0.18663
## Malaysia :11 Malaysia :11 Max. :0.34439
## (Other) :55 (Other) :55 NA's :66
You can click on the little square at the top left of the plot to open it in a new window; there it will have the right proportions.
pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient(
low = "white",
high = "orange",
name = "Fst",
na.value = "white",
limits = c(0, 0.5)
) +
scale_x_discrete(position = "top") +
theme_bw() +
geom_text(aes(label = ifelse(
is.na(value), "", formatC(value, digits = 2, format = "f")
)), size = 3) +
theme(
axis.text.x = element_text(angle = 90, hjust = 0),
axis.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(hjust = 1)
)
pairfst.f2
ggsave(
filename = here("output", "fst", "fst_matrix_r2_0.1_by_country.pdf"),
pairfst.f2,
width = 6,
height = 5,
units = "in"
)
Remove NAs and rename columns
# remove NAs
fst2 <-
pairfst.long |>
drop_na()
# rename columns
fst2 <-
fst2 |>
dplyr::rename(pop1 = 1,
pop2 = 2,
fst = 3)
# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
dplyr::select(pop = pop2, fst)
# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)
# Calculate the mean fst for each population
mean_fst <- df_combined |>
group_by(pop) |>
summarise(mean_fst = mean(fst))
print(mean_fst)
## # A tibble: 25 × 2
## pop mean_fst
## <fct> <dbl>
## 1 OKI 0.150
## 2 HAI 0.0930
## 3 YUN 0.0819
## 4 KAT 0.143
## 5 TAI 0.213
## 6 HUN 0.101
## 7 HAN 0.103
## 8 KLP 0.128
## 9 HOC 0.104
## 10 QNC 0.153
## # ℹ 15 more rows
Merge
fst3 <-
sampling_loc |>
left_join(
mean_fst,
by = c("Abbreviation" = "pop")
) |>
drop_na() |>
dplyr::select(
-Region
)
# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)
# Rename the Region2 column to Region
fst3 <- fst3 |>
dplyr::rename(Region = Region2)
# check output
head(fst3)
## # A tibble: 6 × 7
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0930
## 2 Yunnan China 24.5 101. YUN East 0.0819
## 3 Hunan China 27.6 112. HUN East 0.101
## 4 Okinawa Japan 26.5 128. OKI East 0.150
## 5 Kanazawa Japan 36.6 137. KAN East 0.198
## 6 Utsunomiya Japan 36.6 140. UTS East 0.169
Mean by region
# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
group_by(Region) |>
summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
left_join(region_means, by = "Region")
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0930
## 2 Yunnan China 24.5 101. YUN East 0.0819
## 3 Hunan China 27.6 112. HUN East 0.101
## 4 Okinawa Japan 26.5 128. OKI East 0.150
## 5 Kanazawa Japan 36.6 137. KAN East 0.198
## 6 Utsunomiya Japan 36.6 140. UTS East 0.169
## 7 Kagoshima Japan 31.6 131. KAG East 0.154
## 8 Tainan Taiwan 23.0 120. TAI East 0.213
## 9 Bengaluru India 13.0 77.6 BEN South 0.0974
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.225
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>
Mean by country
# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
group_by(Country) |>
summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
left_join(country_means, by = "Country")
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
## Pop_City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0930
## 2 Yunnan China 24.5 101. YUN East 0.0819
## 3 Hunan China 27.6 112. HUN East 0.101
## 4 Okinawa Japan 26.5 128. OKI East 0.150
## 5 Kanazawa Japan 36.6 137. KAN East 0.198
## 6 Utsunomiya Japan 36.6 140. UTS East 0.169
## 7 Kagoshima Japan 31.6 131. KAG East 0.154
## 8 Tainan Taiwan 23.0 120. TAI East 0.213
## 9 Bengaluru India 13.0 77.6 BEN South 0.0974
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.225
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>
Mean by latitude
# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))
# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
group_by(Latitude_group) |>
summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
ungroup() # Ungroup the data
# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
left_join(summary_by_latitude, by = "Latitude_group")
# Rename columns
fst3 <- fst3 |>
dplyr::rename(
City = Pop_City
)
# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
## City Country Latitude Longitude Abbreviation Region mean_fst
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Hainan China 19.2 110. HAI East 0.0930
## 2 Yunnan China 24.5 101. YUN East 0.0819
## 3 Hunan China 27.6 112. HUN East 0.101
## 4 Okinawa Japan 26.5 128. OKI East 0.150
## 5 Kanazawa Japan 36.6 137. KAN East 0.198
## 6 Utsunomiya Japan 36.6 140. UTS East 0.169
## 7 Kagoshima Japan 31.6 131. KAG East 0.154
## 8 Tainan Taiwan 23.0 120. TAI East 0.213
## 9 Bengaluru India 13.0 77.6 BEN South 0.0974
## 10 Kunfunadhoo Maldives 5.67 73 KUN South 0.225
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## # Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
dplyr::select(
Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
)
fst4 <- fst4 |>
arrange(
Latitude_group, Region, Country, City
)
# Round
fst4 <- fst4 |>
mutate_if(is.numeric, ~ round(., 2))
head(fst4)
## # A tibble: 6 × 9
## Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
## <chr> <dbl> <chr> <dbl> <chr>
## 1 Above 30N 0.17 East 0.14 Japan
## 2 Above 30N 0.17 East 0.14 Japan
## 3 Above 30N 0.17 East 0.14 Japan
## 4 Below 30N 0.13 East 0.14 China
## 5 Below 30N 0.13 East 0.14 China
## 6 Below 30N 0.13 East 0.14 China
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## # Abbreviation <chr>, mean_fst <dbl>
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
font.family = "Arial",
font.size = 9,
big.mark = ",",
theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)
# Then create the flextable object
flex_table <- flextable(fst4) |>
set_caption(caption = as_paragraph(
as_chunk(
"Table 1. Fst values using SNPs after prunning with r2 0.1.",
props = fp_text_default(color = "#000000", font.size = 14)
)
),
fp_p = fp_par(text.align = "center", padding = 5))
# Print the flextable
flex_table
Latitude_group | mean_fst_by_latitude | Region | mean_fst_by_region | Country | mean_fst_by_country | City | Abbreviation | mean_fst |
---|---|---|---|---|---|---|---|---|
Above 30N | 0.17 | East | 0.14 | Japan | 0.17 | Kagoshima | KAG | 0.15 |
Above 30N | 0.17 | East | 0.14 | Japan | 0.17 | Kanazawa | KAN | 0.20 |
Above 30N | 0.17 | East | 0.14 | Japan | 0.17 | Utsunomiya | UTS | 0.17 |
Below 30N | 0.13 | East | 0.14 | China | 0.09 | Hainan | HAI | 0.09 |
Below 30N | 0.13 | East | 0.14 | China | 0.09 | Hunan | HUN | 0.10 |
Below 30N | 0.13 | East | 0.14 | China | 0.09 | Yunnan | YUN | 0.08 |
Below 30N | 0.13 | East | 0.14 | Japan | 0.17 | Okinawa | OKI | 0.15 |
Below 30N | 0.13 | East | 0.14 | Taiwan | 0.21 | Tainan | TAI | 0.21 |
Below 30N | 0.13 | South | 0.16 | India | 0.10 | Bengaluru | BEN | 0.10 |
Below 30N | 0.13 | South | 0.16 | Maldives | 0.23 | Kunfunadhoo | KUN | 0.23 |
Below 30N | 0.13 | South | 0.16 | Nepal | 0.14 | Kathmandu | KAT | 0.14 |
Below 30N | 0.13 | Southeast | 0.12 | Cambodia | 0.08 | Phnom Penh | CAM | 0.08 |
Below 30N | 0.13 | Southeast | 0.12 | Indonesia | 0.17 | Jakarta | INJ | 0.16 |
Below 30N | 0.13 | Southeast | 0.12 | Indonesia | 0.17 | Sulawesi (Forest) | SUF | 0.17 |
Below 30N | 0.13 | Southeast | 0.12 | Indonesia | 0.17 | Sulawesi (Urban) | SUU | 0.13 |
Below 30N | 0.13 | Southeast | 0.12 | Indonesia | 0.17 | Wainyapu | INW | 0.20 |
Below 30N | 0.13 | Southeast | 0.12 | Malaysia | 0.11 | Kuala Lumpur | KLP | 0.13 |
Below 30N | 0.13 | Southeast | 0.12 | Malaysia | 0.11 | Tambun | MAT | 0.09 |
Below 30N | 0.13 | Southeast | 0.12 | Thailand | 0.08 | Chanthaburi | CHA | 0.08 |
Below 30N | 0.13 | Southeast | 0.12 | Thailand | 0.08 | Kanchanaburi | KAC | 0.08 |
Below 30N | 0.13 | Southeast | 0.12 | Thailand | 0.08 | Lampang | LAM | 0.09 |
Below 30N | 0.13 | Southeast | 0.12 | Thailand | 0.08 | Sisaket | SSK | 0.08 |
Below 30N | 0.13 | Southeast | 0.12 | Vietnam | 0.12 | Hanoi | HAN | 0.10 |
Below 30N | 0.13 | Southeast | 0.12 | Vietnam | 0.12 | Ho Chi Minh City | HOC | 0.10 |
Below 30N | 0.13 | Southeast | 0.12 | Vietnam | 0.12 | Quy Nhon City | QNC | 0.15 |
# Initialize Word document
doc <-
read_docx() |>
body_add_flextable(value = flex_table)
# Define the output path with 'here' library
output_path <- here(
"output",
"fst",
"fst_r2_0.1_SNPS.docx"
)
# Save the Word document
print(doc, target = output_path)
Make a scatter plot
# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
dplyr::group_by(Country) |>
dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))
# save the data
saveRDS(aggregated_data, here(
"output", "fst", "r2_0.1_country.rds"
))
# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]
# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)
# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)
# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)
ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
geom_point(aes(color = Country), size = 3) +
geom_line(aes(y = fitted_values), color = "blue") + # Fitted line
labs(
title = "Mean Fst by Country",
x = "Ordered Countries",
y = "Mean Fst Value"
) +
scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "none")
Estimate distances
# Grab the population names from the Fst matrix 'aa' (loaded in a hidden chunk above)
populations_with_fst <- colnames(aa)
# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)
# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation
# Calculate the distances
for (i in 1:n) {
for (j in 1:n) {
if (i != j) {
coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
}
}
}
# Print the distance matrix
head(distance_matrix)
## HAI YUN HUN OKI KAN UTS KAG
## HAI 0.0000 1035.308 965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081 0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN 965.7432 1107.931 0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283 0.000 1390.3468 1589.7383 617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347 0.0000 288.4938 790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738 288.4938 0.0000 1023.2713
## TAI BEN KUN KAT CAM SUF SUU INW
## HAI 1180.1356 3487.311 4241.329 2648.016 987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN 985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI 873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
## INJ KLP MAT SSK KAC CHA LAM HAN
## HAI 2844.575 1984.417 1861.932 722.0761 1218.270 1086.825 1070.0656 441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303 714.4744 601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029 953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
## HOC QNC
## HAI 990.911 604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
Compare distance and FST
# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]
# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0
# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))
# Add row and column indices for easier tracking. Note that as.vector()
# unrolls the matrix column-major, so these labels are transposed relative
# to that order; this is harmless here because both matrices are symmetric.
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))
head(data)
## Distance FST row_index col_index
## 1 0.0000 0.00000000 HAI HAI
## 2 1035.3081 0.08802883 HAI YUN
## 3 965.7432 0.11878133 HAI HUN
## 4 2047.0214 0.19738113 HAI OKI
## 5 3269.9948 0.16804359 HAI KAN
## 6 3527.0034 0.07441617 HAI UTS
Fit linear regression
# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]
# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))
# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
labs(title = "FST vs Distance - All populations",
x = "Log(Distance)",
y = "FST(1-FST)") +
scale_x_continuous(labels = scales::comma) +
theme_classic() +
theme(axis.text.x = element_text(size = 14), # Increase font size for x-axis
axis.text.y = element_text(size = 14
)) # Increase font size for y-axis
## `geom_smooth()` using formula = 'y ~ x'
ggsave(
filename = here("output", "fst", "fst_by_distance_all_samples_LD2.pdf"),
width = 5,
height = 5,
units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
Subset by country. Select countries with at least 3 sampling localities.
countries_with_3_pops <- filtered_sampling_loc %>%
group_by(Country) %>%
filter(n() >= 3) %>%
pull(Country) %>%
unique()
countries_with_3_pops
## [1] "China" "Japan" "Indonesia" "Thailand" "Vietnam"
Run the regression for each country
results <- list()
for (country in countries_with_3_pops) {
# Extract abbreviations for the country
abbreviations <- filtered_sampling_loc %>%
filter(Country == country) %>%
pull(Abbreviation)
# Subset the data
subset_data <- data %>%
filter(row_index %in% abbreviations & col_index %in% abbreviations)
subset_data <- subset_data[subset_data$Distance > 0, ]
# Perform linear regression
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = subset_data)
results[[country]] <- list(
equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
)
}
results
## $China
## $China$equation
## [1] "y = -0.60267x + 4.278"
##
## $China$r2
## [1] "R^2 = 1.00"
##
##
## $Japan
## $Japan$equation
## [1] "y = 0.04552x + -0.120"
##
## $Japan$r2
## [1] "R^2 = 0.08"
##
##
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.05263x + 0.512"
##
## $Indonesia$r2
## [1] "R^2 = 0.16"
##
##
## $Thailand
## $Thailand$equation
## [1] "y = 0.28828x + -1.569"
##
## $Thailand$r2
## [1] "R^2 = 0.38"
##
##
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.05628x + 0.520"
##
## $Vietnam$r2
## [1] "R^2 = 0.15"
Merge the data
data_merged <- data %>%
left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
rename(Country1 = Country) %>%
left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
dplyr::select(-Pop_City.x, -Pop_City.y) %>%
filter(Country1 == Country) # Ensures the data is within the same country
# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")
# Filter
data_filtered <- data_merged %>%
group_by(Country1) %>%
filter(n() >= 3 & Country1 %in% countries_to_include) %>%
ungroup()
Calculate the linear regression for each country
regression_results <- data_filtered %>%
group_by(Country1) %>%
do(model = lm(FST/(1-FST) ~ log(Distance), data = .)) %>%
rowwise() %>%
mutate(equation = sprintf("italic(y) == %.3f * italic(x) + %.3f", coef(model)[2], coef(model)[1]),
r2 = sprintf("italic(R)^2 == %.2f", summary(model)$r.squared))
Plot it
ggplot(data_filtered, aes(x = log(Distance), y = FST/(1-FST))) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~ Country1, scales = "free", ncol = 2) +
geom_text(
data = regression_results,
aes(label = paste(equation, r2, sep = "~~")),
x = Inf, y = Inf,
vjust = 2, hjust = 1.2,
size = 3.5,
parse = TRUE,
inherit.aes = FALSE
) +
labs(title = "Fst vs Distance by Country",
x = "Log(Distance)",
y = "Fst(1-Fst)") +
scale_x_continuous(labels = scales::comma) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
ggsave(
filename = here("output", "fst", "fst_by_distance_countries_LD2.pdf"),
width = 6,
height = 8,
units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
We can merge the FST and distance matrices
# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)
# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]
# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names
# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]
# Format the elements based on their position in the matrix:
# Fst with two decimals (upper triangle), distances in km with zero decimals (lower triangle)
for(i in 1:nrow(merged_matrix)) {
for(j in 1:ncol(merged_matrix)) {
if (i < j) {
# Upper triangular - Fst values with two decimal places
merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
} else if (i > j) {
# Lower triangular - Distance values with zero decimal places
merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
}
}
}
# Now the merged_matrix should be formatted as you need
print(merged_matrix)
## BEN CAM CHA HAI HAN HOC HUN INJ INW KAC
## BEN NA "0.03" "0.02" "0.07" "0.07" "0.08" "0.09" "0.14" "0.17" "0.02"
## CAM "2976" NA "0.00" "0.04" "0.04" "0.05" "0.06" "0.13" "0.16" "0.00"
## CHA "2660" "329" NA "0.05" "0.05" "0.06" "0.07" "0.13" "0.16" "0.00"
## HAI "3487" "987" "1087" NA "0.04" "0.03" "0.04" "0.15" "0.19" "0.05"
## HAN "3134" "1059" "1019" "441" NA "0.05" "0.05" "0.16" "0.22" "0.04"
## HOC "3174" "209" "537" "991" "1146" NA "0.04" "0.17" "0.21" "0.06"
## HUN "3913" "1929" "1956" "966" "954" "1954" NA "0.16" "0.20" "0.07"
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA "0.07" "0.13"
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA "0.17"
## KAC "2377" "647" "320" "1218" "1028" "856" "1981" "2393" "3492" NA
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713"
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456"
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119"
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234"
## KUN "957" "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052"
## LAM "2419" "951" "692" "1070" "733" "1139" "1637" "2844" "3859" "475"
## MAT "2751" "875" "893" "1862" "1894" "912" "2799" "1363" "2650" "1060"
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265"
## QNC "3424" "528" "782" "604" "882" "436" "1565" "2241" "2880" "1047"
## SSK "2894" "402" "368" "722" "678" "547" "1595" "2390" "3271" "530"
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859" "2874"
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973" "3147"
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986" "3562" "3635" "2397"
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722"
## YUN "2806" "1487" "1323" "1035" "601" "1627" "1108" "3467" "4320" "1179"
## KAG KAN KAT KLP KUN LAM MAT OKI QNC SSK
## BEN "0.15" "0.19" "0.11" "0.08" "0.13" "0.03" "0.04" "0.14" "0.10" "0.03"
## CAM "0.12" "0.17" "0.10" "0.05" "0.15" "0.01" "0.02" "0.11" "0.08" "0.01"
## CHA "0.13" "0.18" "0.10" "0.06" "0.15" "0.01" "0.02" "0.12" "0.08" "0.00"
## HAI "0.10" "0.14" "0.13" "0.08" "0.19" "0.05" "0.06" "0.09" "0.12" "0.05"
## HAN "0.12" "0.17" "0.14" "0.09" "0.22" "0.05" "0.06" "0.11" "0.13" "0.05"
## HOC "0.10" "0.15" "0.15" "0.09" "0.21" "0.06" "0.07" "0.10" "0.13" "0.06"
## HUN "0.08" "0.12" "0.15" "0.10" "0.20" "0.07" "0.08" "0.07" "0.14" "0.07"
## INJ "0.22" "0.27" "0.09" "0.17" "0.26" "0.13" "0.14" "0.21" "0.20" "0.13"
## INW "0.26" "0.32" "0.11" "0.24" "0.34" "0.17" "0.18" "0.26" "0.24" "0.16"
## KAC "0.13" "0.18" "0.10" "0.06" "0.17" "0.01" "0.02" "0.12" "0.08" "0.00"
## KAG NA "0.13" "0.21" "0.17" "0.26" "0.14" "0.14" "0.12" "0.19" "0.13"
## KAN "791" NA "0.25" "0.22" "0.32" "0.19" "0.19" "0.17" "0.24" "0.18"
## KAT "4367" "4880" NA "0.15" "0.23" "0.11" "0.11" "0.20" "0.18" "0.10"
## KLP "4376" "5163" "3242" NA "0.24" "0.06" "0.06" "0.16" "0.14" "0.06"
## KUN "6631" "7300" "2779" "3196" NA "0.17" "0.17" "0.26" "0.24" "0.16"
## LAM "3450" "4160" "1789" "1704" "3202" NA "0.02" "0.12" "0.09" "0.01"
## MAT "4289" "5073" "3069" "177" "3124" "1531" NA "0.13" "0.10" "0.02"
## OKI "618" "1390" "4204" "3822" "6262" "3060" "3747" NA "0.19" "0.12"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA "0.08"
## SSK "3232" "3991" "2412" "1365" "3582" "624" "1217" "2758" "548" NA
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873" "1548" "1886"
## UTS "1023" "288" "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714" "2208" "2678" "1449" "1087"
## SUF SUU TAI UTS YUN
## BEN "0.15" "0.11" "0.20" "0.17" "0.02"
## CAM "0.14" "0.09" "0.17" "0.14" "0.00"
## CHA "0.14" "0.10" "0.18" "0.15" "0.00"
## HAI "0.16" "0.09" "0.14" "0.11" "0.05"
## HAN "0.18" "0.11" "0.18" "0.13" "0.04"
## HOC "0.17" "0.10" "0.16" "0.12" "0.06"
## HUN "0.17" "0.10" "0.13" "0.09" "0.07"
## INJ "0.08" "0.09" "0.27" "0.24" "0.13"
## INW "0.10" "0.13" "0.34" "0.28" "0.16"
## KAC "0.15" "0.10" "0.19" "0.15" "-0.00"
## KAG "0.23" "0.17" "0.19" "0.09" "0.13"
## KAN "0.27" "0.21" "0.23" "0.10" "0.18"
## KAT "0.06" "0.07" "0.26" "0.22" "0.10"
## KLP "0.20" "0.14" "0.24" "0.19" "0.05"
## KUN "0.30" "0.25" "0.34" "0.28" "0.16"
## LAM "0.15" "0.10" "0.19" "0.16" "0.01"
## MAT "0.15" "0.11" "0.19" "0.16" "0.02"
## OKI "0.22" "0.16" "0.17" "0.14" "0.12"
## QNC "0.22" "0.17" "0.25" "0.21" "0.08"
## SSK "0.14" "0.10" "0.18" "0.15" "0.00"
## SUF NA "0.04" "0.29" "0.24" "0.14"
## SUU "381" NA "0.22" "0.18" "0.10"
## TAI "2777" "2747" NA "0.20" "0.18"
## UTS "4755" "4566" "2418" NA "0.15"
## YUN "3566" "3744" "1928" "3902" NA
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
We can sort by distance
# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)
# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))
# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]
# Initialize and fill the merged_matrix as before
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]
# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
for(j in 1:ncol(merged_matrix)) {
if (i < j) {
merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
} else if (i > j) {
merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
}
}
}
# Print the merged matrix
print(merged_matrix)
## SSK QNC CAM HOC CHA HAI HAN KAC LAM YUN
## SSK NA "0.08" "0.01" "0.06" "0.00" "0.05" "0.05" "0.00" "0.01" "0.00"
## QNC "548" NA "0.08" "0.13" "0.08" "0.12" "0.13" "0.08" "0.09" "0.08"
## CAM "402" "528" NA "0.05" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## HOC "547" "436" "209" NA "0.06" "0.03" "0.05" "0.06" "0.06" "0.06"
## CHA "368" "782" "329" "537" NA "0.05" "0.05" "0.00" "0.01" "0.00"
## HAI "722" "604" "987" "991" "1087" NA "0.04" "0.05" "0.05" "0.05"
## HAN "678" "882" "1059" "1146" "1019" "441" NA "0.04" "0.05" "0.04"
## KAC "530" "1047" "647" "856" "320" "1218" "1028" NA "0.01" "0.00"
## LAM "624" "1155" "951" "1139" "692" "1070" "733" "475" NA "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601" "1179" "714" NA
## MAT "1217" "1348" "875" "912" "893" "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966" "954" "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
## MAT HUN KLP TAI OKI SUF INJ SUU KAT KAG
## SSK "0.02" "0.07" "0.06" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## QNC "0.10" "0.14" "0.14" "0.25" "0.19" "0.22" "0.20" "0.17" "0.18" "0.19"
## CAM "0.02" "0.06" "0.05" "0.17" "0.11" "0.14" "0.13" "0.09" "0.10" "0.12"
## HOC "0.07" "0.04" "0.09" "0.16" "0.10" "0.17" "0.17" "0.10" "0.15" "0.10"
## CHA "0.02" "0.07" "0.06" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## HAI "0.06" "0.04" "0.08" "0.14" "0.09" "0.16" "0.15" "0.09" "0.13" "0.10"
## HAN "0.06" "0.05" "0.09" "0.18" "0.11" "0.18" "0.16" "0.11" "0.14" "0.12"
## KAC "0.02" "0.07" "0.06" "0.19" "0.12" "0.15" "0.13" "0.10" "0.10" "0.13"
## LAM "0.02" "0.07" "0.06" "0.19" "0.12" "0.15" "0.13" "0.10" "0.11" "0.14"
## YUN "0.02" "0.07" "0.05" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## MAT NA "0.08" "0.06" "0.19" "0.13" "0.15" "0.14" "0.11" "0.11" "0.14"
## HUN "2799" NA "0.10" "0.13" "0.07" "0.17" "0.16" "0.10" "0.15" "0.08"
## KLP "177" "2932" NA "0.24" "0.16" "0.20" "0.17" "0.14" "0.15" "0.17"
## TAI "2894" "986" "2979" NA "0.17" "0.29" "0.27" "0.22" "0.26" "0.19"
## OKI "3747" "1599" "3822" "873" NA "0.22" "0.21" "0.16" "0.20" "0.12"
## SUF "2220" "3406" "2115" "2777" "3280" NA "0.08" "0.04" "0.06" "0.23"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA "0.09" "0.09" "0.22"
## SUU "2565" "3468" "2469" "2747" "3154" "381" "1911" NA "0.07" "0.17"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA "0.21"
## KAG "4289" "1860" "4376" "1397" "618" "3893" "4901" "3756" "4367" NA
## INW "2650" "4248" "2505" "3635" "4110" "859" "1528" "973" "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791"
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
## INW BEN KAN UTS KUN
## SSK "0.16" "0.03" "0.18" "0.15" "0.16"
## QNC "0.24" "0.10" "0.24" "0.21" "0.24"
## CAM "0.16" "0.03" "0.17" "0.14" "0.15"
## HOC "0.21" "0.08" "0.15" "0.12" "0.21"
## CHA "0.16" "0.02" "0.18" "0.15" "0.15"
## HAI "0.19" "0.07" "0.14" "0.11" "0.19"
## HAN "0.22" "0.07" "0.17" "0.13" "0.22"
## KAC "0.17" "0.02" "0.18" "0.15" "0.17"
## LAM "0.17" "0.03" "0.19" "0.16" "0.17"
## YUN "0.16" "0.02" "0.18" "0.15" "0.16"
## MAT "0.18" "0.04" "0.19" "0.16" "0.17"
## HUN "0.20" "0.09" "0.12" "0.09" "0.20"
## KLP "0.24" "0.08" "0.22" "0.19" "0.24"
## TAI "0.34" "0.20" "0.23" "0.20" "0.34"
## OKI "0.26" "0.14" "0.17" "0.14" "0.26"
## SUF "0.10" "0.15" "0.27" "0.24" "0.30"
## INJ "0.07" "0.14" "0.27" "0.24" "0.26"
## SUU "0.13" "0.11" "0.21" "0.18" "0.25"
## KAT "0.11" "0.11" "0.25" "0.22" "0.23"
## KAG "0.26" "0.15" "0.13" "0.09" "0.26"
## INW NA "0.17" "0.32" "0.28" "0.34"
## BEN "5347" NA "0.19" "0.17" "0.13"
## KAN "5424" "6410" NA "0.10" "0.32"
## UTS "5539" "6696" "288" NA "0.28"
## KUN "5513" "957" "7300" "7583" NA
Make a table and save it as a Word document
# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)
# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]
# Create a flextable object from the merged_matrix
ft <- qflextable(as.data.frame(merged_df))
ft
Population | SSK | QNC | CAM | HOC | CHA | HAI | HAN | KAC | LAM | YUN | MAT | HUN | KLP | TAI | OKI | SUF | INJ | SUU | KAT | KAG | INW | BEN | KAN | UTS | KUN |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SSK | 0.08 | 0.01 | 0.06 | 0.00 | 0.05 | 0.05 | 0.00 | 0.01 | 0.00 | 0.02 | 0.07 | 0.06 | 0.18 | 0.12 | 0.14 | 0.13 | 0.10 | 0.10 | 0.13 | 0.16 | 0.03 | 0.18 | 0.15 | 0.16 | |
QNC | 548 | 0.08 | 0.13 | 0.08 | 0.12 | 0.13 | 0.08 | 0.09 | 0.08 | 0.10 | 0.14 | 0.14 | 0.25 | 0.19 | 0.22 | 0.20 | 0.17 | 0.18 | 0.19 | 0.24 | 0.10 | 0.24 | 0.21 | 0.24 | |
CAM | 402 | 528 | 0.05 | 0.00 | 0.04 | 0.04 | 0.00 | 0.01 | 0.00 | 0.02 | 0.06 | 0.05 | 0.17 | 0.11 | 0.14 | 0.13 | 0.09 | 0.10 | 0.12 | 0.16 | 0.03 | 0.17 | 0.14 | 0.15 | |
HOC | 547 | 436 | 209 | 0.06 | 0.03 | 0.05 | 0.06 | 0.06 | 0.06 | 0.07 | 0.04 | 0.09 | 0.16 | 0.10 | 0.17 | 0.17 | 0.10 | 0.15 | 0.10 | 0.21 | 0.08 | 0.15 | 0.12 | 0.21 | |
CHA | 368 | 782 | 329 | 537 | 0.05 | 0.05 | 0.00 | 0.01 | 0.00 | 0.02 | 0.07 | 0.06 | 0.18 | 0.12 | 0.14 | 0.13 | 0.10 | 0.10 | 0.13 | 0.16 | 0.02 | 0.18 | 0.15 | 0.15 | |
HAI | 722 | 604 | 987 | 991 | 1087 | 0.04 | 0.05 | 0.05 | 0.05 | 0.06 | 0.04 | 0.08 | 0.14 | 0.09 | 0.16 | 0.15 | 0.09 | 0.13 | 0.10 | 0.19 | 0.07 | 0.14 | 0.11 | 0.19 | |
HAN | 678 | 882 | 1059 | 1146 | 1019 | 441 | 0.04 | 0.05 | 0.04 | 0.06 | 0.05 | 0.09 | 0.18 | 0.11 | 0.18 | 0.16 | 0.11 | 0.14 | 0.12 | 0.22 | 0.07 | 0.17 | 0.13 | 0.22 | |
KAC | 530 | 1047 | 647 | 856 | 320 | 1218 | 1028 | 0.01 | 0.00 | 0.02 | 0.07 | 0.06 | 0.19 | 0.12 | 0.15 | 0.13 | 0.10 | 0.10 | 0.13 | 0.17 | 0.02 | 0.18 | 0.15 | 0.17 | |
LAM | 624 | 1155 | 951 | 1139 | 692 | 1070 | 733 | 475 | 0.01 | 0.02 | 0.07 | 0.06 | 0.19 | 0.12 | 0.15 | 0.13 | 0.10 | 0.11 | 0.14 | 0.17 | 0.03 | 0.19 | 0.16 | 0.17 | |
YUN | 1087 | 1449 | 1487 | 1627 | 1323 | 1035 | 601 | 1179 | 714 | 0.02 | 0.07 | 0.05 | 0.18 | 0.12 | 0.14 | 0.13 | 0.10 | 0.10 | 0.13 | 0.16 | 0.02 | 0.18 | 0.15 | 0.16 | |
MAT | 1217 | 1348 | 875 | 912 | 893 | 1862 | 1894 | 1060 | 1531 | 2208 | 0.08 | 0.06 | 0.19 | 0.13 | 0.15 | 0.14 | 0.11 | 0.11 | 0.14 | 0.18 | 0.04 | 0.19 | 0.16 | 0.17 | |
HUN | 1595 | 1565 | 1929 | 1954 | 1956 | 966 | 954 | 1981 | 1637 | 1108 | 2799 | 0.10 | 0.13 | 0.07 | 0.17 | 0.16 | 0.10 | 0.15 | 0.08 | 0.20 | 0.09 | 0.12 | 0.09 | 0.20 | |
KLP | 1365 | 1445 | 1003 | 1011 | 1055 | 1984 | 2042 | 1234 | 1704 | 2375 | 177 | 2932 | 0.24 | 0.16 | 0.20 | 0.17 | 0.14 | 0.15 | 0.17 | 0.24 | 0.08 | 0.22 | 0.19 | 0.24 | |
TAI | 1886 | 1548 | 2061 | 1982 | 2237 | 1180 | 1497 | 2397 | 2219 | 1928 | 2894 | 986 | 2979 | 0.17 | 0.29 | 0.27 | 0.22 | 0.26 | 0.19 | 0.34 | 0.20 | 0.23 | 0.20 | 0.34 | |
OKI | 2758 | 2410 | 2929 | 2839 | 3110 | 2047 | 2328 | 3265 | 3060 | 2678 | 3747 | 1599 | 3822 | 873 | 0.22 | 0.21 | 0.16 | 0.20 | 0.12 | 0.26 | 0.14 | 0.17 | 0.14 | 0.26 | |
SUF | 2567 | 2117 | 2245 | 2046 | 2557 | 2614 | 2986 | 2874 | 3182 | 3566 | 2220 | 3406 | 2115 | 2777 | 3280 | 0.08 | 0.04 | 0.06 | 0.23 | 0.10 | 0.15 | 0.27 | 0.24 | 0.30 | |
INJ | 2390 | 2241 | 1989 | 1889 | 2160 | 2845 | 3034 | 2393 | 2844 | 3467 | 1363 | 3805 | 1188 | 3562 | 4294 | 1535 | 0.09 | 0.09 | 0.22 | 0.07 | 0.14 | 0.27 | 0.24 | 0.26 | |
SUU | 2797 | 2309 | 2506 | 2300 | 2827 | 2752 | 3151 | 3147 | 3420 | 3744 | 2565 | 3468 | 2469 | 2747 | 3154 | 381 | 1911 | 0.07 | 0.17 | 0.13 | 0.11 | 0.21 | 0.18 | 0.25 | |
KAT | 2412 | 2922 | 2724 | 2921 | 2424 | 2648 | 2207 | 2119 | 1789 | 1640 | 3069 | 2611 | 3242 | 3537 | 4204 | 4967 | 4429 | 5209 | 0.21 | 0.11 | 0.11 | 0.25 | 0.22 | 0.23 | |
KAG | 3232 | 2942 | 3446 | 3377 | 3595 | 2510 | 2721 | 3713 | 3450 | 2967 | 4289 | 1860 | 4376 | 1397 | 618 | 3893 | 4901 | 3756 | 4367 | 0.26 | 0.15 | 0.13 | 0.09 | 0.26 | |
INW | 3271 | 2880 | 2908 | 2727 | 3190 | 3419 | 3762 | 3492 | 3859 | 4320 | 2650 | 4248 | 2505 | 3635 | 4110 | 859 | 1528 | 973 | 5611 | 4718 | 0.17 | 0.32 | 0.28 | 0.34 | |
BEN | 2894 | 3424 | 2976 | 3174 | 2660 | 3487 | 3134 | 2377 | 2419 | 2806 | 2751 | 3913 | 2869 | 4631 | 5450 | 4970 | 3876 | 5307 | 1828 | 5773 | 5347 | 0.19 | 0.17 | 0.13 | |
KAN | 3991 | 3725 | 4223 | 4161 | 4357 | 3270 | 3444 | 4456 | 4160 | 3619 | 5073 | 2532 | 5163 | 2185 | 1390 | 4621 | 5685 | 4452 | 4880 | 791 | 5424 | 6410 | 0.10 | 0.32 | |
UTS | 4249 | 3965 | 4470 | 4399 | 4613 | 3527 | 3715 | 4722 | 4435 | 3902 | 5311 | 2811 | 5396 | 2418 | 1590 | 4755 | 5874 | 4566 | 5167 | 1023 | 5539 | 6696 | 288 | 0.28 | |
KUN | 3582 | 4070 | 3572 | 3749 | 3288 | 4241 | 3934 | 3052 | 3202 | 3681 | 3124 | 4777 | 3196 | 5414 | 6262 | 5293 | 3986 | 5658 | 2779 | 6631 | 5513 | 957 | 7300 | 7583 |
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6194515 330.9 11233262 600.0 NA 11233262 600.0
## Vcells 10497539 80.1 18029751 137.6 32768 14958126 114.2
Import the data
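The loading chunks are hidden. Each presumably reads a country-level summary saved earlier with saveRDS; the LD2 path matches the saveRDS call in the r2_0.1 analysis above, while the intergenic and LD1 paths are hypothetical:
intergenic <- readRDS(here("output", "fst", "intergenic_country.rds")) # hypothetical path
LD1 <- readRDS(here("output", "fst", "r2_0.01_country.rds")) # hypothetical path
LD2 <- readRDS(here("output", "fst", "r2_0.1_country.rds"))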
Intergenic
## # A tibble: 6 × 2
## Country mean_fst
## <chr> <dbl>
## 1 Cambodia 0.05
## 2 China 0.07
## 3 India 0.07
## 4 Indonesia 0.11
## 5 Japan 0.12
## 6 Malaysia 0.09
LD1
## # A tibble: 6 × 2
## Country mean_fst
## <chr> <dbl>
## 1 Cambodia 0.08
## 2 China 0.09
## 3 India 0.1
## 4 Indonesia 0.16
## 5 Japan 0.17
## 6 Malaysia 0.11
LD2
## # A tibble: 6 × 2
## Country mean_fst
## <chr> <dbl>
## 1 Cambodia 0.08
## 2 China 0.09
## 3 India 0.1
## 4 Indonesia 0.17
## 5 Japan 0.17
## 6 Malaysia 0.11
Merge the data sets
# Add an identifier column to each dataset
intergenic$dataset <- "intergenic"
LD1$dataset <- "LD1"
LD2$dataset <- "LD2"
# Merge the datasets
combined_data <- rbind(intergenic, LD1, LD2)
# Assign a numeric index for plotting based on ordered countries
combined_data <- combined_data %>%
arrange(Country) %>%
group_by(Country) %>%
mutate(index = dense_rank(Country))
# Custom color mapping
color_mapping <- scale_color_manual(
values = c(intergenic = "gray", LD1 = "blue", LD2 = "orange"),
name = "Dataset"
)
# Shapes
shape_mapping <- scale_shape_manual(
values = c(intergenic = 16, LD1 = 17, LD2 = 18),
name = "Dataset"
)
# Scatter plot with custom colors and shapes
plot <- ggplot(combined_data, aes(x = Country, y = mean_fst, color = dataset, shape = dataset)) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = FALSE, aes(group = dataset)) + # Fitted line per dataset
labs(
title = "",
x = "Countries",
y = "Mean Fst"
) +
color_mapping +
shape_mapping +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "top")
# Display the plot
print(plot)
## `geom_smooth()` using formula = 'y ~ x'
ggsave(
filename = here("output", "fst", "fst_datasets_country.pdf"),
plot,
width = 7,
height = 5,
units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.1 \
--keep-fam output/fst/pops_4fst.txt \
--make-bed \
--out output/fst/mantel \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel.log
Then make sure to edit the line “BEN a 0 0 0 -9” to “BEN b 0 0 0 -9” in output/fst/mantel.fam (two BEN samples share the ID “a”, and duplicated sample IDs cause problems downstream).
Then convert to raw format
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/mantel \
--recodeA \
--out output/fst/mantel \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel.log
Import the data and convert it to genind format
# import the data
albo <-
read.PLINK(
here("output", "fst", "mantel.raw"),
quiet = FALSE,
chunkSize = 1000,
parallel = require("parallel"),
n.cores = 4
)
# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)
Load it
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
The fam file
fam_file <- here(
"output", "fst", "mantel.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# # Replace the BEN a with BEN b (remember to never name samples with the same ID... I change it manually in the fam file.)
# fam_data <- fam_data %>%
# mutate(IndividualID = ifelse(FamilyID == "BEN" & IndividualID == "a", "b", IndividualID))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Merge
# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))
head(loc_albo)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1 OKI 1001 0 0 2 -9 Okinawa Japan
## 2 OKI 1002 0 0 2 -9 Okinawa Japan
## 3 OKI 1003 0 0 2 -9 Okinawa Japan
## 4 OKI 1004 0 0 2 -9 Okinawa Japan
## 5 OKI 1005 0 0 2 -9 Okinawa Japan
## 6 OKI 1006 0 0 1 -9 Okinawa Japan
## Latitude Longitude Region
## 1 26.5013 127.9454 Asia
## 2 26.5013 127.9454 Asia
## 3 26.5013 127.9454 Asia
## 4 26.5013 127.9454 Asia
## 5 26.5013 127.9454 Asia
## 6 26.5013 127.9454 Asia
Get the latitude and longitude
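The chunk is hidden; a minimal sketch, assuming the coordinates come from loc_albo (cbind of bare vectors reproduces the unnamed columns below):
coords <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(coords)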
## [,1] [,2]
## [1,] 26.5013 127.9454
## [2,] 26.5013 127.9454
## [3,] 26.5013 127.9454
## [4,] 26.5013 127.9454
## [5,] 26.5013 127.9454
## [6,] 26.5013 127.9454
Add jitter
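Also hidden; a sketch using base R's jitter, so individuals from the same locality do not sit on identical coordinates. The amount and seed are assumptions:
set.seed(123) # hypothetical seed
xy <- cbind(x = jitter(coords[, 1], amount = 0.02),
            y = jitter(coords[, 2], amount = 0.02))
head(xy)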
## x y
## [1,] 26.51917 127.9537
## [2,] 26.51278 127.9527
## [3,] 26.52068 127.9563
## [4,] 26.52102 127.9391
## [5,] 26.51013 127.9400
## [6,] 26.51191 127.9547
Add to object
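Presumably the jittered coordinates go into the genind object's other slot (they are used later as albo2_subset$other$xy):
albo2@other$xy <- xy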
Save
Isolation by distance
Convert the data
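The conversion is hidden; given the messages below and the later use of toto, it was presumably:
toto <- genind2genpop(albo2)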
##
## Converting data from a genind to a genpop object...
##
## ...done.
Get one mosquito per population; this is just to get the geographical coordinates.
unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
inds_in_pop <- which(albo2@pop == unique_populations[i])
selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]
Mantel test
Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
##
## Observation: 0.225538
##
## Based on 999 replicates
## Simulated p-value: 0.074
## Alternative hypothesis: greater
##
## Std.Obs Expectation Variance
## 1.504698485 0.005512092 0.021382012
Explanations: Observation: This is the observed Mantel statistic for your data, 0.225538. It represents the correlation between the two matrices. A positive value indicates a positive correlation, meaning as one distance increases, so does the other.
Monte-Carlo Test: This test involves repeatedly randomizing one of the matrices and calculating the Mantel statistic for each randomized version. This process creates a distribution of Mantel statistics under the null hypothesis (which states that there is no association between the matrices).
Simulated p-value: The p-value is 0.074. This value represents the proportion of replicates in the Monte-Carlo test where the Mantel statistic was greater than or equal to the observed statistic (0.225538). Because it is above the typical cutoff of 0.05, the observed correlation is not conventionally ‘significant’, although it hints at a positive trend.
Alternative Hypothesis: greater: This indicates that the test was one-sided, testing whether the observed Mantel statistic is greater than what would be expected by chance.
Std.Obs (Standardized Observation): This is the observed Mantel statistic standardized (1.504698) using the expectation and variance from the Monte-Carlo replicates.
Expectation and Variance: These are the mean (0.005512) and variance (0.021382) of the Mantel statistic under the null hypothesis, calculated from the Monte-Carlo replicates.
Interpretation: The observed Mantel statistic (0.225538) suggests a moderate positive correlation between genetic and geographic distances. The p-value of 0.074 indicates a trend towards significance, but it is not strong enough to reject the null hypothesis of no correlation. The test specifically asked whether genetic distance increases with geographic distance (one-sided test). The standardized observation (Std.Obs) being positive and relatively large suggests that the observed correlation is somewhat higher than what would be expected by chance.
Plot
## quartz_off_screen
## 2
plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)
Add the equation
# Plotting the data
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)
# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared
# Generating the equation string
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)
# Adding the equation and R^2 to the plot
# You can adjust the position (e.g., x and y values) as necessary
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.9, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.85, labels = r2_label)
Use the MASS library (kde2d) for a 2D density plot
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
# CairoPDF(here("output", "fst", "ibd.pdf"),
# width = 5,
# height = 4)
png(here("output", "fst", "ibd2.png"),
width = 5,
height = 4,
units='in',
res = 300)
myPal <-
colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared
# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)
# Adding the equation and R^2 to the plot
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.5, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.45, labels = r2_label)
title("Isolation by distance")
dev.off()
## quartz_off_screen
## 2
Check the populations. Those with fewer than 4 mosquitoes were not included.
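Presumably a simple count of the population factor:
table(albo2@pop)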
## BEN CAM CHA HAI HAN HOC HUN INJ INW KAC KAG KAN KAT KLP KUN LAM MAT OKI QNC SSK
## 12 12 12 12 4 7 12 11 4 6 12 11 10 4 4 9 12 11 12 12
## SUF SUU TAI UTS YUN
## 6 6 8 12 9