Libraries

library(StAMPP)
library(ggplot2)
library(tidyverse)
library(adegenet)
library(here)
library(flextable)
library(officer)
library(reshape2)
library(dplyr)
library(tidyr)
library(geosphere)
library(dartR)
library(MASS)
library(Cairo)

We can use different data sets to run our Fst estimates.

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 5834115 311.6    9855684 526.4         NA  7331362 391.6
## Vcells 9598989  73.3   14832738 113.2      32768 12293949  93.8

1. Intergenic SNPs

We can estimate Fst using only the neutral set of SNPs for populations with at least 4 individuals.

Create list of populations

awk '{print $1}' output/populations/snps_sets/neutral.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 4 {print}' | awk '{print $1}' > output/fst/pops_4fst.txt;
head  output/fst/pops_4fst.txt;
wc -l output/fst/pops_4fst.txt
## BEN
## CAM
## CHA
## HAI
## HAN
## HOC
## HUN
## INJ
## INW
## KAC
##       25 output/fst/pops_4fst.txt
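
For reference, the same filtering can be done in R instead of awk. A minimal sketch, assuming the population (family) IDs sit in the first column of the .fam file:

# Count individuals per population (first column of the .fam file)
# and keep populations with at least 4 individuals
fam <- read.table(here("output", "populations", "snps_sets", "neutral.fam"))
pop_counts <- table(fam$V1)
writeLines(sort(names(pop_counts[pop_counts >= 4])),
           here("output", "fst", "pops_4fst.txt"))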

We have 25 populations with 4 or more mosquitoes. We can convert the data to raw format and subset the bed file.

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/neutral \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/neutral \
--silent;
grep 'samples\|variants\|remaining' output/fst/neutral.log
## 9483 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.96967.
## 9483 variants and 230 people pass filters and QC.

See https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of the Fst estimation.

neutral <-
  read.PLINK(
    here(
      "output", "fst", "neutral.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

summary(neutral)

Now let's convert the genlight object to StAMPP format and estimate pairwise Fst values.

The named-argument call, e.g. stamppFst(neutral_2, nboots = 100, percent = 95, nclusters = 10), would also work; below we simplify it and pass only the numbers positionally.

This chunk will take a couple of minutes to run.

# convert
neutral_2 <- stamppConvert(neutral, type="genlight")

# run StAMPP. If you want to run with bootstraps and multiple clusters, use
# the HPC; it will run out of memory on a 32 GB laptop
# (positional arguments: nboots = 1, percent = 95, nclusters = 1)
neutral_3 <- stamppFst(neutral_2, 1, 95, 1)
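
On the HPC, a bootstrapped run returns a list rather than a single matrix. A sketch, assuming 100 bootstraps and 10 cores:

# With nboots > 1, stamppFst returns a list holding the pairwise Fst matrix,
# p-values, and bootstrap confidence limits (see the StAMPP manual linked above)
neutral_boot <- stamppFst(neutral_2, nboots = 100, percent = 95, nclusters = 10)
neutral_boot$Fsts     # pairwise Fst point estimates
neutral_boot$Pvalues  # p-values for each pairwise comparison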

Save it

saveRDS(
  neutral_3, here(
    "output", "fst", "neutral.rds"
  )
)

To load it

neutral_3 <- readRDS(
  here(
    "output", "fst", "neutral.rds"
  )
)

Now let's look at the object

summary(neutral_3)
##       OKI               HAI               YUN                 KAT         
##  Min.   :0.06693   Min.   :0.02807   Min.   :-0.002161   Min.   :0.06445  
##  1st Qu.:0.08088   1st Qu.:0.03572   1st Qu.: 0.017006   1st Qu.:0.07817  
##  Median :0.09944   Median :0.05998   Median : 0.055814   Median :0.09368  
##  Mean   :0.11276   Mean   :0.06803   Mean   : 0.056186   Mean   :0.11228  
##  3rd Qu.:0.14177   3rd Qu.:0.09611   3rd Qu.: 0.082102   3rd Qu.:0.13943  
##  Max.   :0.21810   Max.   :0.15999   Max.   : 0.143292   Max.   :0.21573  
##  NA's   :1         NA's   :2         NA's   :3           NA's   :4        
##       TAI              HUN               HAN               KLP         
##  Min.   :0.1233   Min.   :0.03590   Min.   :0.03056   Min.   :0.04714  
##  1st Qu.:0.1495   1st Qu.:0.04296   1st Qu.:0.04201   1st Qu.:0.05136  
##  Median :0.1642   Median :0.05908   Median :0.08134   Median :0.10614  
##  Mean   :0.1780   Mean   :0.07034   Mean   :0.08233   Mean   :0.10225  
##  3rd Qu.:0.2007   3rd Qu.:0.09196   3rd Qu.:0.11386   3rd Qu.:0.12868  
##  Max.   :0.3004   Max.   :0.16409   Max.   :0.19157   Max.   :0.21872  
##  NA's   :5        NA's   :6         NA's   :7         NA's   :8        
##       HOC               QNC               SSK                KAC           
##  Min.   :0.03383   Min.   :0.07153   Min.   :0.000295   Min.   :-0.000224  
##  1st Qu.:0.04598   1st Qu.:0.07691   1st Qu.:0.010202   1st Qu.: 0.014289  
##  Median :0.07343   Median :0.13239   Median :0.069284   Median : 0.075309  
##  Mean   :0.08110   Mean   :0.12117   Mean   :0.056040   Mean   : 0.060492  
##  3rd Qu.:0.10792   3rd Qu.:0.14787   3rd Qu.:0.083976   3rd Qu.: 0.085694  
##  Max.   :0.17718   Max.   :0.21538   Max.   :0.134878   Max.   : 0.145295  
##  NA's   :9         NA's   :10        NA's   :11         NA's   :12         
##       CHA                KAN               UTS               CAM          
##  Min.   :0.003151   Min.   :0.09212   Min.   :0.07893   Min.   :0.008014  
##  1st Qu.:0.017775   1st Qu.:0.11198   1st Qu.:0.09267   1st Qu.:0.019739  
##  Median :0.077002   Median :0.12201   Median :0.10527   Median :0.070429  
##  Mean   :0.063989   Mean   :0.14434   Mean   :0.12345   Mean   :0.064043  
##  3rd Qu.:0.087944   3rd Qu.:0.16900   3rd Qu.:0.14159   3rd Qu.:0.081284  
##  Max.   :0.131817   Max.   :0.23897   Max.   :0.21422   Max.   :0.135431  
##  NA's   :13         NA's   :14        NA's   :15        NA's   :16        
##       KAG               BEN               LAM               SUF         
##  Min.   :0.08122   Min.   :0.02333   Min.   :0.01692   Min.   :0.03162  
##  1st Qu.:0.09061   1st Qu.:0.05390   1st Qu.:0.06908   1st Qu.:0.06323  
##  Median :0.12139   Median :0.08997   Median :0.08394   Median :0.06969  
##  Mean   :0.12750   Mean   :0.07831   Mean   :0.08446   Mean   :0.09810  
##  3rd Qu.:0.14741   3rd Qu.:0.10404   3rd Qu.:0.10618   3rd Qu.:0.09388  
##  Max.   :0.20375   Max.   :0.11899   Max.   :0.14422   Max.   :0.23208  
##  NA's   :17        NA's   :18        NA's   :19        NA's   :20       
##       SUU               KUN              INW               INJ         
##  Min.   :0.06603   Min.   :0.1486   Min.   :0.06648   Min.   :0.08835  
##  1st Qu.:0.07088   1st Qu.:0.1799   1st Qu.:0.08016   1st Qu.:0.08835  
##  Median :0.08605   Median :0.2113   Median :0.09384   Median :0.08835  
##  Mean   :0.11206   Mean   :0.2123   Mean   :0.09384   Mean   :0.08835  
##  3rd Qu.:0.12723   3rd Qu.:0.2441   3rd Qu.:0.10751   3rd Qu.:0.08835  
##  Max.   :0.21012   Max.   :0.2769   Max.   :0.12119   Max.   :0.08835  
##  NA's   :21        NA's   :22       NA's   :23        NA's   :24       
##       MAT     
##  Min.   : NA  
##  1st Qu.: NA  
##  Median : NA  
##  Mean   :NaN  
##  3rd Qu.: NA  
##  Max.   : NA  
##  NA's   :25

If you want, you can save the Fst values as a CSV file.

# Convert to data frame
neutral_df <- data.frame(neutral_3)

# Save it
write.csv(neutral_df, file = here("output", "fst", "neutral_df.csv"))

Check the Fst values

head(neutral_df)
##            OKI        HAI        YUN       KAT       TAI HUN HAN KLP HOC QNC
## OKI         NA         NA         NA        NA        NA  NA  NA  NA  NA  NA
## HAI 0.07268706         NA         NA        NA        NA  NA  NA  NA  NA  NA
## YUN 0.07937617 0.03397451         NA        NA        NA  NA  NA  NA  NA  NA
## KAT 0.14886465 0.10179042 0.07762791        NA        NA  NA  NA  NA  NA  NA
## TAI 0.15614563 0.12689237 0.14329227 0.2157330        NA  NA  NA  NA  NA  NA
## HUN 0.06693096 0.02807325 0.04026851 0.1034478 0.1233325  NA  NA  NA  NA  NA
##     SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## YUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KAT  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

We will convert the data into a matrix.

aa <- as.matrix(neutral_df)
# mirror the lower triangle into the upper triangle to complete the matrix
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##            OKI        HAI        YUN        KAT       TAI        HUN        HAN
## OKI         NA 0.07268706 0.07937617 0.14886465 0.1561456 0.06693096 0.09294157
## HAI 0.07268706         NA 0.03397451 0.10179042 0.1268924 0.02807325 0.03807541
## YUN 0.07937617 0.03397451         NA 0.07762791 0.1432923 0.04026851 0.03421635
## KAT 0.14886465 0.10179042 0.07762791         NA 0.2157330 0.10344776 0.11088115
## TAI 0.15614563 0.12689237 0.14329227 0.21573302        NA 0.12333248 0.16471928
## HUN 0.06693096 0.02807325 0.04026851 0.10344776 0.1233325         NA 0.04385787
##            KLP        HOC        QNC         SSK          KAC          CHA
## OKI 0.12753263 0.07966691 0.14381991 0.080917127  0.082764633 0.0807573335
## HAI 0.07062727 0.02857376 0.09737992 0.036399112  0.031226075 0.0350480370
## YUN 0.04892423 0.03983731 0.07317901 0.001820884 -0.002160858 0.0007563386
## KAT 0.12941018 0.11252413 0.14504693 0.078173740  0.078066760 0.0766586019
## TAI 0.20155700 0.14637599 0.20875730 0.142121310  0.152625899 0.1397690667
## HUN 0.07440170 0.03618065 0.10249995 0.042059268  0.038117472 0.0413984260
##            KAN        UTS         CAM        KAG        BEN        LAM
## OKI 0.13456942 0.10960357 0.075330223 0.10320709 0.09568234 0.08756987
## HAI 0.09711038 0.07278214 0.030013229 0.06963227 0.04835553 0.03913709
## YUN 0.10828863 0.08297531 0.002435562 0.07597810 0.01756076 0.00724616
## KAT 0.17428707 0.14702796 0.075728451 0.13942968 0.08425904 0.08100140
## TAI 0.19294422 0.17076422 0.136334355 0.16363753 0.15654314 0.15057531
## HUN 0.08888333 0.06669664 0.035903383 0.05908263 0.05334759 0.04549865
##            SUF        SUU       KUN        INW        INJ        MAT
## OKI 0.14108869 0.11063606 0.2181045 0.18145357 0.14398625 0.09257954
## HAI 0.09101011 0.05998414 0.1599904 0.12872924 0.09510255 0.04482997
## YUN 0.08325162 0.06270272 0.1364487 0.10513394 0.07948148 0.01682092
## KAT 0.06444802 0.06708021 0.2034093 0.09367875 0.08599529 0.09162392
## TAI 0.21266171 0.17599097 0.3003592 0.26676013 0.20046117 0.15440513
## HUN 0.09503274 0.06400679 0.1640872 0.13267018 0.09920459 0.05348803

Import sample locations

sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    Region2, Country
  )

# Check it
head(sampling_loc)
## # A tibble: 6 × 7
##   Pop_City Country Latitude Longitude Region Abbreviation Region2  
##   <chr>    <chr>      <dbl>     <dbl> <chr>  <chr>        <chr>    
## 1 Hainan   China       19.2      110. Asia   HAI          East Asia
## 2 Yunnan   China       24.5      101. Asia   YUN          East Asia
## 3 Hunan    China       27.6      112. Asia   HUN          East Asia
## 4 Okinawa  Japan       26.5      128. Asia   OKI          East Asia
## 5 Sendai   Japan       38.3      141. Asia   SEN          East Asia
## 6 Nagasaki Japan       32.8      130. Asia   NAG          East Asia

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##  [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"

Create vector with order of populations

# Extract the populations that appear in neutral_df
populations_in_neutral <- colnames(neutral_df)

# Keep only the populations present in order_pops
# (this preserves the column order of neutral_df)
poporder <- populations_in_neutral[populations_in_neutral %in% order_pops]

# Print the reordered populations
print(poporder)
##  [1] "OKI" "HAI" "YUN" "KAT" "TAI" "HUN" "HAN" "KLP" "HOC" "QNC" "SSK" "KAC"
## [13] "CHA" "KAN" "UTS" "CAM" "KAG" "BEN" "LAM" "SUF" "SUU" "KUN" "INW" "INJ"
## [25] "MAT"

Let's check whether the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Now let's order the matrix using poporder. We will also set the lower triangle of the matrix to NA.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1          Var2         value        
##  OKI    : 25   OKI    : 25   Min.   :-0.0022  
##  HAI    : 25   HAI    : 25   1st Qu.: 0.0589  
##  YUN    : 25   YUN    : 25   Median : 0.0871  
##  KAT    : 25   KAT    : 25   Mean   : 0.0959  
##  TAI    : 25   TAI    : 25   3rd Qu.: 0.1325  
##  HUN    : 25   HUN    : 25   Max.   : 0.3004  
##  (Other):475   (Other):475   NA's   :325

Now let's plot the data with ggplot. You can click the little square at the top left of the plot to open it in a new window; it will have the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

ggsave(
  filename = here("output", "fst", "fst_matrix_neutral.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

By country

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(neutral_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# The symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##             Cambodia      China      India  Indonesia      Japan   Malaysia
## Cambodia          NA 0.02278406 0.01973887 0.08102356 0.07791002 0.03291285
## China     0.02278406         NA 0.03975463 0.09135918 0.07657845 0.05151535
## India     0.01973887 0.03975463         NA 0.09289674 0.10153368 0.05126178
## Indonesia 0.08102356 0.09135918 0.09289674         NA 0.14968381 0.11478857
## Japan     0.07791002 0.07657845 0.10153368 0.14968381         NA 0.11864271
## Malaysia  0.03291285 0.05151535 0.05126178 0.11478857 0.11864271         NA
## Maldives  0.13543120 0.15350879 0.11898677 0.23259152 0.21875971 0.18365581
## Nepal     0.07572845 0.09657847 0.08425904 0.07780057 0.15122311 0.11051705
## Taiwan    0.13633435 0.12921240 0.15654314 0.21396849 0.16596381 0.17798106
## Thailand  0.00533491 0.02637889 0.02054397 0.08322300 0.08912422 0.03349593
## Vietnam   0.04581641 0.05486669 0.06564601 0.12518159 0.10974240 0.08606174
##            Maldives      Nepal    Taiwan   Thailand    Vietnam
## Cambodia  0.1354312 0.07572845 0.1363344 0.00533491 0.04581641
## China     0.1535088 0.09657847 0.1292124 0.02637889 0.05486669
## India     0.1189868 0.08425904 0.1565431 0.02054397 0.06564601
## Indonesia 0.2325915 0.07780057 0.2139685 0.08322300 0.12518159
## Japan     0.2187597 0.15122311 0.1659638 0.08912422 0.10974240
## Malaysia  0.1836558 0.11051705 0.1779811 0.03349593 0.08606174
## Maldives         NA 0.20340929 0.3003592 0.13905356 0.19471054
## Nepal     0.2034093         NA 0.2157330 0.07847512 0.12281740
## Taiwan    0.3003592 0.21573302        NA 0.14627290 0.17328419
## Thailand  0.1390536 0.07847512 0.1462729         NA 0.05095433
## Vietnam   0.1947105 0.12281740 0.1732842 0.05095433         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##           Cambodia      China      India  Indonesia      Japan   Malaysia
## Cambodia        NA 0.02278406 0.01973887 0.08102356 0.07791002 0.03291285
## China           NA         NA 0.03975463 0.09135918 0.07657845 0.05151535
## India           NA         NA         NA 0.09289674 0.10153368 0.05126178
## Indonesia       NA         NA         NA         NA 0.14968381 0.11478857
## Japan           NA         NA         NA         NA         NA 0.11864271
## Malaysia        NA         NA         NA         NA         NA         NA
## Maldives        NA         NA         NA         NA         NA         NA
## Nepal           NA         NA         NA         NA         NA         NA
## Taiwan          NA         NA         NA         NA         NA         NA
## Thailand        NA         NA         NA         NA         NA         NA
## Vietnam         NA         NA         NA         NA         NA         NA
##            Maldives      Nepal    Taiwan   Thailand    Vietnam
## Cambodia  0.1354312 0.07572845 0.1363344 0.00533491 0.04581641
## China     0.1535088 0.09657847 0.1292124 0.02637889 0.05486669
## India     0.1189868 0.08425904 0.1565431 0.02054397 0.06564601
## Indonesia 0.2325915 0.07780057 0.2139685 0.08322300 0.12518159
## Japan     0.2187597 0.15122311 0.1659638 0.08912422 0.10974240
## Malaysia  0.1836558 0.11051705 0.1779811 0.03349593 0.08606174
## Maldives         NA 0.20340929 0.3003592 0.13905356 0.19471054
## Nepal            NA         NA 0.2157330 0.07847512 0.12281740
## Taiwan           NA         NA        NA 0.14627290 0.17328419
## Thailand         NA         NA        NA         NA 0.05095433
## Vietnam          NA         NA        NA         NA         NA
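
The double loop above can also be written in vectorized form. A sketch that takes the element-wise mean of each pair of off-diagonal cells while ignoring NAs:

# Vectorized equivalent of the symmetrizing loop (a sketch)
m <- fst_matrix_summary
n_obs <- (!is.na(m)) + (!is.na(t(m)))  # number of non-NA cells per pair (0-2)
m0 <- replace(m, is.na(m), 0)
t0 <- replace(t(m), is.na(t(m)), 0)
symmetric_fst_matrix2 <- (m0 + t0) / n_obs  # NaN where both cells are NA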

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##         Var1           Var2        value        
##  Cambodia :11   Cambodia :11   Min.   :0.00533  
##  China    :11   China    :11   1st Qu.:0.07069  
##  India    :11   India    :11   Median :0.10153  
##  Indonesia:11   Indonesia:11   Mean   :0.11113  
##  Japan    :11   Japan    :11   3rd Qu.:0.15045  
##  Malaysia :11   Malaysia :11   Max.   :0.30036  
##  (Other)  :55   (Other)  :55   NA's   :66

You can click the little square at the top left of the plot to open it in a new window; it will have the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "fst", "fst_matrix_neutral_by_country.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 25 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 OKI     0.113 
##  2 HAI     0.0682
##  3 YUN     0.0562
##  4 KAT     0.112 
##  5 TAI     0.175 
##  6 HUN     0.0708
##  7 HAN     0.0819
##  8 KLP     0.103 
##  9 HOC     0.0777
## 10 QNC     0.123 
## # ℹ 15 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Region
  )

# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)

# Rename the Region2 column to Region
fst3 <- fst3 |>
  dplyr::rename(Region = Region2)

# check output
head(fst3)
## # A tibble: 6 × 7
##   Pop_City   Country Latitude Longitude Abbreviation Region mean_fst
##   <chr>      <chr>      <dbl>     <dbl> <chr>        <chr>     <dbl>
## 1 Hainan     China       19.2      110. HAI          East     0.0682
## 2 Yunnan     China       24.5      101. YUN          East     0.0562
## 3 Hunan      China       27.6      112. HUN          East     0.0708
## 4 Okinawa    Japan       26.5      128. OKI          East     0.113 
## 5 Kanazawa   Japan       36.6      137. KAN          East     0.136 
## 6 Utsunomiya Japan       36.6      140. UTS          East     0.112

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0682
##  2 Yunnan      China       24.5      101.  YUN          East     0.0562
##  3 Hunan       China       27.6      112.  HUN          East     0.0708
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.113 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.136 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.112 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.106 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.175 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0689
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.189 
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>

Mean by country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0682
##  2 Yunnan      China       24.5      101.  YUN          East     0.0562
##  3 Hunan       China       27.6      112.  HUN          East     0.0708
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.113 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.136 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.112 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.106 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.175 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0689
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.189 
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City
  )

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
##    City        Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0682
##  2 Yunnan      China       24.5      101.  YUN          East     0.0562
##  3 Hunan       China       27.6      112.  HUN          East     0.0708
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.113 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.136 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.112 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.106 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.175 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0689
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.189 
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## #   Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
## # A tibble: 6 × 9
##   Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
##   <chr>                         <dbl> <chr>               <dbl> <chr>  
## 1 Above 30N                      0.12 East                  0.1 Japan  
## 2 Above 30N                      0.12 East                  0.1 Japan  
## 3 Above 30N                      0.12 East                  0.1 Japan  
## 4 Below 30N                      0.09 East                  0.1 China  
## 5 Below 30N                      0.09 East                  0.1 China  
## 6 Below 30N                      0.09 East                  0.1 China  
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## #   Abbreviation <chr>, mean_fst <dbl>
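
As a side note, mutate_if is superseded in current dplyr; the same rounding with the across idiom would be (a sketch):

# Equivalent rounding with the current dplyr idiom (a sketch)
fst4 <- fst4 |>
  mutate(across(where(is.numeric), ~ round(.x, 2)))
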
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values using intergenic SNPs.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values using intergenic SNPs.

Latitude_group  mean_fst_by_latitude  Region     mean_fst_by_region  Country    mean_fst_by_country  City               Abbreviation  mean_fst
Above 30N       0.12                  East       0.10                Japan      0.12                 Kagoshima          KAG           0.11
Above 30N       0.12                  East       0.10                Japan      0.12                 Kanazawa           KAN           0.14
Above 30N       0.12                  East       0.10                Japan      0.12                 Utsunomiya         UTS           0.11
Below 30N       0.09                  East       0.10                China      0.07                 Hainan             HAI           0.07
Below 30N       0.09                  East       0.10                China      0.07                 Hunan              HUN           0.07
Below 30N       0.09                  East       0.10                China      0.07                 Yunnan             YUN           0.06
Below 30N       0.09                  East       0.10                Japan      0.12                 Okinawa            OKI           0.11
Below 30N       0.09                  East       0.10                Taiwan     0.18                 Tainan             TAI           0.18
Below 30N       0.09                  South      0.12                India      0.07                 Bengaluru          BEN           0.07
Below 30N       0.09                  South      0.12                Maldives   0.19                 Kunfunadhoo        KUN           0.19
Below 30N       0.09                  South      0.12                Nepal      0.11                 Kathmandu          KAT           0.11
Below 30N       0.09                  Southeast  0.09                Cambodia   0.05                 Phnom Penh         CAM           0.05
Below 30N       0.09                  Southeast  0.09                Indonesia  0.11                 Jakarta            INJ           0.11
Below 30N       0.09                  Southeast  0.09                Indonesia  0.11                 Sulawesi (Forest)  SUF           0.11
Below 30N       0.09                  Southeast  0.09                Indonesia  0.11                 Sulawesi (Urban)   SUU           0.09
Below 30N       0.09                  Southeast  0.09                Indonesia  0.11                 Wainyapu           INW           0.14
Below 30N       0.09                  Southeast  0.09                Malaysia   0.09                 Kuala Lumpur       KLP           0.10
Below 30N       0.09                  Southeast  0.09                Malaysia   0.09                 Tambun             MAT           0.07
Below 30N       0.09                  Southeast  0.09                Thailand   0.06                 Chanthaburi        CHA           0.06
Below 30N       0.09                  Southeast  0.09                Thailand   0.06                 Kanchanaburi       KAC           0.06
Below 30N       0.09                  Southeast  0.09                Thailand   0.06                 Lampang            LAM           0.06
Below 30N       0.09                  Southeast  0.09                Thailand   0.06                 Sisaket            SSK           0.06
Below 30N       0.09                  Southeast  0.09                Vietnam    0.09                 Hanoi              HAN           0.08
Below 30N       0.09                  Southeast  0.09                Vietnam    0.09                 Ho Chi Minh City   HOC           0.08
Below 30N       0.09                  Southeast  0.09                Vietnam    0.09                 Quy Nhon City      QNC           0.12

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "fst", 
  "fst_neutral_SNPS.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make a scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "fst", "neutral_country.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           HAI      YUN       HUN      OKI       KAN       UTS       KAG
## HAI    0.0000 1035.308  965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081    0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN  965.7432 1107.931    0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283    0.000 1390.3468 1589.7383  617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347    0.0000  288.4938  790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738  288.4938    0.0000 1023.2713
##           TAI      BEN      KUN      KAT       CAM      SUF      SUU      INW
## HAI 1180.1356 3487.311 4241.329 2648.016  987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN  985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI  873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
##          INJ      KLP      MAT       SSK      KAC      CHA       LAM       HAN
## HAI 2844.575 1984.417 1861.932  722.0761 1218.270 1086.825 1070.0656  441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303  714.4744  601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029  953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
##          HOC       QNC
## HAI  990.911  604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
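
The double loop works, but geosphere can compute the whole matrix in one vectorized call. A sketch using distm (distHaversine assumes a spherical Earth by default):

# Vectorized alternative: all pairwise great-circle distances at once
coords <- as.matrix(filtered_sampling_loc[, c("Longitude", "Latitude")])
distance_matrix2 <- distm(coords, fun = distHaversine) / 1000  # km
rownames(distance_matrix2) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix2) <- filtered_sampling_loc$Abbreviation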

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0

# Align the two matrices: 'aa' is ordered by poporder while 'distance_matrix'
# follows the sampling_loc order, so reorder before combining
distance_matrix <- distance_matrix[colnames(aa), colnames(aa)]

# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
# (as.vector() traverses the matrix column by column, so the row label
# varies fastest and the column label repeats in blocks)
data$row_index <- rep(rownames(distance_matrix), times = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), each = nrow(distance_matrix))

data <- data |>
  dplyr::arrange(
    Distance
  )
head(data)
##   Distance FST row_index col_index
## 1        0   0       HAI       HAI
## 2        0   0       YUN       YUN
## 3        0   0       HUN       HUN
## 4        0   0       OKI       OKI
## 5        0   0       KAN       KAN
## 6        0   0       UTS       UTS

Fit a linear regression of linearized Fst, Fst/(1 - Fst), against log distance (Rousset's isolation-by-distance model).

# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))


# Plot
ggplot(data, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = max(data$Distance) * 0.85, y = max(data$FST) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Distance (Km)",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_all_samples.pdf"),
  width = 6,
  height = 4,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
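
The points in this regression are pairwise comparisons and therefore not independent, so the parametric p-value is anti-conservative. A Mantel test is the usual significance test for isolation by distance. A sketch, assuming the vegan package is installed (it is not loaded in the Libraries chunk):

# Mantel test of isolation by distance (a sketch; requires the vegan package)
library(vegan)
fst_lin <- aa / (1 - aa)  # linearized Fst, matching the regression above
mantel(as.dist(distance_matrix), as.dist(fst_lin), permutations = 999)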

Subset by country. Select countries with at least 3 sampling localities.

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "China"     "Japan"     "Indonesia" "Thailand"  "Vietnam"

Do test for each country

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  
  # Perform linear regression
  lm_model <- lm(FST ~ Distance, data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $China
## $China$equation
## [1] "y = -0.00032x + 0.395"
## 
## $China$r2
## [1] "R^2 = 0.87"
## 
## 
## $Japan
## $Japan$equation
## [1] "y = 0.00002x + 0.111"
## 
## $Japan$r2
## [1] "R^2 = 0.02"
## 
## 
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.00002x + 0.105"
## 
## $Indonesia$r2
## [1] "R^2 = 0.11"
## 
## 
## $Thailand
## $Thailand$equation
## [1] "y = 0.00038x + -0.063"
## 
## $Thailand$r2
## [1] "R^2 = 0.46"
## 
## 
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.00002x + 0.108"
## 
## $Vietnam$r2
## [1] "R^2 = 0.07"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = Country) %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == Country)  # Ensures the data is within the same country


# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% countries_to_include) %>%
  ungroup()

Calculate the linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST ~ Distance, data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("y = %.3fx + %.3f", coef(model)[2], coef(model)[1]),
         r2 = sprintf("R^2 = %.2f", summary(model)$r.squared))

Plot it

ggplot(data_filtered, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(data = regression_results, aes(label = paste(equation, r2, sep = "\n"), x = Inf, y = Inf), 
            vjust = 2, hjust = 2, size = 3.5, inherit.aes = FALSE) +
  labs(title = "FST vs Distance by Country",
       x = "Distance",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_countries.pdf"),
  width = 6,
  height = 8,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'

We can merge the Fst and distance matrices.

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the elements based on their position in the matrix:
# Fst with two decimals (upper triangle), distances in km with zero decimals (lower triangle)
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# The merged matrix is now formatted as needed
print(merged_matrix)
##     BEN    CAM    CHA    HAI    HAN    HOC    HUN    INJ    INW    KAC    
## BEN NA     "0.02" "0.02" "0.05" "0.05" "0.06" "0.05" "0.09" "0.12" "0.02" 
## CAM "2976" NA     "0.00" "0.03" "0.03" "0.03" "0.04" "0.08" "0.11" "0.00" 
## CHA "2660" "329"  NA     "0.04" "0.03" "0.04" "0.04" "0.08" "0.10" "-0.00"
## HAI "3487" "987"  "1087" NA     "0.04" "0.03" "0.03" "0.10" "0.13" "0.03" 
## HAN "3134" "1059" "1019" "441"  NA     "0.04" "0.04" "0.11" "0.16" "0.03" 
## HOC "3174" "209"  "537"  "991"  "1146" NA     "0.04" "0.11" "0.15" "0.04" 
## HUN "3913" "1929" "1956" "966"  "954"  "1954" NA     "0.10" "0.13" "0.04" 
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA     "0.07" "0.08" 
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA     "0.11" 
## KAC "2377" "647"  "320"  "1218" "1028" "856"  "1981" "2393" "3492" NA     
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713" 
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456" 
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119" 
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234" 
## KUN "957"  "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052" 
## LAM "2419" "951"  "692"  "1070" "733"  "1139" "1637" "2844" "3859" "475"  
## MAT "2751" "875"  "893"  "1862" "1894" "912"  "2799" "1363" "2650" "1060" 
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265" 
## QNC "3424" "528"  "782"  "604"  "882"  "436"  "1565" "2241" "2880" "1047" 
## SSK "2894" "402"  "368"  "722"  "678"  "547"  "1595" "2390" "3271" "530"  
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859"  "2874" 
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973"  "3147" 
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986"  "3562" "3635" "2397" 
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722" 
## YUN "2806" "1487" "1323" "1035" "601"  "1627" "1108" "3467" "4320" "1179" 
##     KAG    KAN    KAT    KLP    KUN    LAM    MAT    OKI    QNC    SSK   
## BEN "0.09" "0.12" "0.08" "0.07" "0.12" "0.02" "0.03" "0.10" "0.09" "0.02"
## CAM "0.07" "0.10" "0.08" "0.05" "0.14" "0.01" "0.02" "0.08" "0.07" "0.00"
## CHA "0.08" "0.11" "0.08" "0.05" "0.13" "0.01" "0.02" "0.08" "0.07" "0.00"
## HAI "0.07" "0.10" "0.10" "0.07" "0.16" "0.04" "0.04" "0.07" "0.10" "0.04"
## HAN "0.09" "0.12" "0.11" "0.08" "0.19" "0.04" "0.05" "0.09" "0.12" "0.04"
## HOC "0.08" "0.11" "0.11" "0.08" "0.18" "0.05" "0.05" "0.08" "0.11" "0.04"
## HUN "0.06" "0.09" "0.10" "0.07" "0.16" "0.05" "0.05" "0.07" "0.10" "0.04"
## INJ "0.14" "0.17" "0.09" "0.12" "0.21" "0.08" "0.09" "0.14" "0.14" "0.08"
## INW "0.17" "0.21" "0.09" "0.18" "0.28" "0.11" "0.12" "0.18" "0.18" "0.11"
## KAC "0.08" "0.11" "0.08" "0.05" "0.15" "0.00" "0.01" "0.08" "0.07" "0.00"
## KAG NA     "0.11" "0.14" "0.12" "0.20" "0.08" "0.09" "0.10" "0.14" "0.08"
## KAN "791"  NA     "0.17" "0.15" "0.24" "0.11" "0.12" "0.13" "0.17" "0.11"
## KAT "4367" "4880" NA     "0.13" "0.20" "0.08" "0.09" "0.15" "0.15" "0.08"
## KLP "4376" "5163" "3242" NA     "0.22" "0.05" "0.06" "0.13" "0.13" "0.05"
## KUN "6631" "7300" "2779" "3196" NA     "0.14" "0.15" "0.22" "0.22" "0.13"
## LAM "3450" "4160" "1789" "1704" "3202" NA     "0.02" "0.09" "0.08" "0.01"
## MAT "4289" "5073" "3069" "177"  "3124" "1531" NA     "0.09" "0.09" "0.02"
## OKI "618"  "1390" "4204" "3822" "6262" "3060" "3747" NA     "0.14" "0.08"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA     "0.07"
## SSK "3232" "3991" "2412" "1365" "3582" "624"  "1217" "2758" "548"  NA    
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873"  "1548" "1886"
## UTS "1023" "288"  "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714"  "2208" "2678" "1449" "1087"
##     SUF    SUU    TAI    UTS    YUN    
## BEN "0.09" "0.07" "0.16" "0.10" "0.02" 
## CAM "0.08" "0.06" "0.14" "0.08" "0.00" 
## CHA "0.08" "0.06" "0.14" "0.08" "0.00" 
## HAI "0.09" "0.06" "0.13" "0.07" "0.03" 
## HAN "0.12" "0.08" "0.16" "0.10" "0.03" 
## HOC "0.11" "0.07" "0.15" "0.08" "0.04" 
## HUN "0.10" "0.06" "0.12" "0.07" "0.04" 
## INJ "0.06" "0.07" "0.20" "0.14" "0.08" 
## INW "0.07" "0.10" "0.27" "0.18" "0.11" 
## KAC "0.08" "0.06" "0.15" "0.09" "-0.00"
## KAG "0.14" "0.11" "0.16" "0.08" "0.08" 
## KAN "0.17" "0.14" "0.19" "0.09" "0.11" 
## KAT "0.06" "0.07" "0.22" "0.15" "0.08" 
## KLP "0.14" "0.11" "0.20" "0.13" "0.05" 
## KUN "0.23" "0.21" "0.30" "0.21" "0.14" 
## LAM "0.09" "0.07" "0.15" "0.09" "0.01" 
## MAT "0.09" "0.07" "0.15" "0.10" "0.02" 
## OKI "0.14" "0.11" "0.16" "0.11" "0.08" 
## QNC "0.15" "0.13" "0.21" "0.15" "0.07" 
## SSK "0.08" "0.06" "0.14" "0.08" "0.00" 
## SUF NA     "0.03" "0.21" "0.14" "0.08" 
## SUU "381"  NA     "0.18" "0.11" "0.06" 
## TAI "2777" "2747" NA     "0.17" "0.14" 
## UTS "4755" "4566" "2418" NA     "0.08" 
## YUN "3566" "3744" "1928" "3902" NA
cities <- readRDS(here("output", "populations", "cities.rds"))
head(cities)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN

We can sort by distance

# Calculate row-wise mean distances (the zero diagonal is included in the mean)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Initialize and fill the merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format as before, but take absolute values of Fst in the upper triangle
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     SSK    QNC    CAM    HOC    CHA    HAI    HAN    KAC    LAM    YUN   
## SSK NA     "0.07" "0.00" "0.04" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## QNC "548"  NA     "0.07" "0.11" "0.07" "0.10" "0.12" "0.07" "0.08" "0.07"
## CAM "402"  "528"  NA     "0.03" "0.00" "0.03" "0.03" "0.00" "0.01" "0.00"
## HOC "547"  "436"  "209"  NA     "0.04" "0.03" "0.04" "0.04" "0.05" "0.04"
## CHA "368"  "782"  "329"  "537"  NA     "0.04" "0.03" "0.00" "0.01" "0.00"
## HAI "722"  "604"  "987"  "991"  "1087" NA     "0.04" "0.03" "0.04" "0.03"
## HAN "678"  "882"  "1059" "1146" "1019" "441"  NA     "0.03" "0.04" "0.03"
## KAC "530"  "1047" "647"  "856"  "320"  "1218" "1028" NA     "0.00" "0.00"
## LAM "624"  "1155" "951"  "1139" "692"  "1070" "733"  "475"  NA     "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601"  "1179" "714"  NA    
## MAT "1217" "1348" "875"  "912"  "893"  "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966"  "954"  "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
##     MAT    HUN    KLP    TAI    OKI    SUF    INJ    SUU    KAT    KAG   
## SSK "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## QNC "0.09" "0.10" "0.13" "0.21" "0.14" "0.15" "0.14" "0.13" "0.15" "0.14"
## CAM "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.07"
## HOC "0.05" "0.04" "0.08" "0.15" "0.08" "0.11" "0.11" "0.07" "0.11" "0.08"
## CHA "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## HAI "0.04" "0.03" "0.07" "0.13" "0.07" "0.09" "0.10" "0.06" "0.10" "0.07"
## HAN "0.05" "0.04" "0.08" "0.16" "0.09" "0.12" "0.11" "0.08" "0.11" "0.09"
## KAC "0.01" "0.04" "0.05" "0.15" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## LAM "0.02" "0.05" "0.05" "0.15" "0.09" "0.09" "0.08" "0.07" "0.08" "0.08"
## YUN "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## MAT NA     "0.05" "0.06" "0.15" "0.09" "0.09" "0.09" "0.07" "0.09" "0.09"
## HUN "2799" NA     "0.07" "0.12" "0.07" "0.10" "0.10" "0.06" "0.10" "0.06"
## KLP "177"  "2932" NA     "0.20" "0.13" "0.14" "0.12" "0.11" "0.13" "0.12"
## TAI "2894" "986"  "2979" NA     "0.16" "0.21" "0.20" "0.18" "0.22" "0.16"
## OKI "3747" "1599" "3822" "873"  NA     "0.14" "0.14" "0.11" "0.15" "0.10"
## SUF "2220" "3406" "2115" "2777" "3280" NA     "0.06" "0.03" "0.06" "0.14"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA     "0.07" "0.09" "0.14"
## SUU "2565" "3468" "2469" "2747" "3154" "381"  "1911" NA     "0.07" "0.11"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA     "0.14"
## KAG "4289" "1860" "4376" "1397" "618"  "3893" "4901" "3756" "4367" NA    
## INW "2650" "4248" "2505" "3635" "4110" "859"  "1528" "973"  "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791" 
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
##     INW    BEN    KAN    UTS    KUN   
## SSK "0.11" "0.02" "0.11" "0.08" "0.13"
## QNC "0.18" "0.09" "0.17" "0.15" "0.22"
## CAM "0.11" "0.02" "0.10" "0.08" "0.14"
## HOC "0.15" "0.06" "0.11" "0.08" "0.18"
## CHA "0.10" "0.02" "0.11" "0.08" "0.13"
## HAI "0.13" "0.05" "0.10" "0.07" "0.16"
## HAN "0.16" "0.05" "0.12" "0.10" "0.19"
## KAC "0.11" "0.02" "0.11" "0.09" "0.15"
## LAM "0.11" "0.02" "0.11" "0.09" "0.14"
## YUN "0.11" "0.02" "0.11" "0.08" "0.14"
## MAT "0.12" "0.03" "0.12" "0.10" "0.15"
## HUN "0.13" "0.05" "0.09" "0.07" "0.16"
## KLP "0.18" "0.07" "0.15" "0.13" "0.22"
## TAI "0.27" "0.16" "0.19" "0.17" "0.30"
## OKI "0.18" "0.10" "0.13" "0.11" "0.22"
## SUF "0.07" "0.09" "0.17" "0.14" "0.23"
## INJ "0.07" "0.09" "0.17" "0.14" "0.21"
## SUU "0.10" "0.07" "0.14" "0.11" "0.21"
## KAT "0.09" "0.08" "0.17" "0.15" "0.20"
## KAG "0.17" "0.09" "0.11" "0.08" "0.20"
## INW NA     "0.12" "0.21" "0.18" "0.28"
## BEN "5347" NA     "0.12" "0.10" "0.12"
## KAN "5424" "6410" NA     "0.09" "0.24"
## UTS "5539" "6696" "288"  NA     "0.21"
## KUN "5513" "957"  "7300" "7583" NA

Make a table and save it as a Word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]


# Create a flextable object from the merged data frame
ft <- qflextable(merged_df)

ft

(Flextable rendering of the merged matrix shown above: Fst values in the upper triangle, pairwise distances in km in the lower triangle.)

# Create a new Word document
doc <- read_docx()

# Add the flextable to the Word document
doc <- body_add_flextable(doc, ft)

# Save the Word document
print(doc, target =  here("output", "fst", "intergenic.docx"))

2. SNP set r2 0.01

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  6247934 333.7   11285547 602.8         NA 10548578 563.4
## Vcells 10497484  80.1   18053012 137.8      32768 14977146 114.3
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.01 \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/r2_0.01 \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.01.log
## 21095 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.969806.
## 21095 variants and 230 people pass filters and QC.

Look at https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of the Fst estimation. The --recodeA step above writes a .raw file with genotypes coded 0/1/2, which is the format read.PLINK() expects.

r2_0.01 <-
  read.PLINK(
    here(
      "output", "fst", "r2_0.01.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

summary(r2_0.01)
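
A few adegenet accessors are handy for quick sanity checks on the genlight object; a small usage sketch (the expected counts come from the PLINK log above):

# Quick sanity checks on the genlight object
nInd(r2_0.01)       # number of individuals; should match the 230 in the PLINK log
nLoc(r2_0.01)       # number of loci; should match the 21095 variants
head(pop(r2_0.01))  # population assignments (taken from the family IDs)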

This chunk will take a couple of minutes to run.

# convert
r2_0.01_2 <- stamppConvert(r2_0.01, type="genlight")

# run stampp. If you want to run with bootstraps and nclusters, use the HPC. It will run out of memory on a 32 GB laptop
r2_0.01_3 <- stamppFst(r2_0.01_2, 1, 95, 1)
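
On the HPC the same call can be run with bootstraps to obtain the confidence intervals and p-values; a sketch with illustrative argument values (see the stamppFst help page linked above for the components of the returned list):

# 100 bootstraps, 95% confidence level, 10 cores; with nboots > 1 the result is a
# list holding the Fst matrix plus the p-values, bootstraps, and CI limits
r2_0.01_boot <- stamppFst(r2_0.01_2, nboots = 100, percent = 95, nclusters = 10)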

Save it

saveRDS(
  r2_0.01_3, here(
    "output", "fst", "r2_0.01.rds"
  )
)

To load it

r2_0.01_3 <- readRDS(
  here(
    "output", "fst", "r2_0.01.rds"
  )
)

Now let's look at the object

summary(r2_0.01_3)
##       OKI               HAI               YUN                  KAT         
##  Min.   :0.07411   Min.   :0.03305   Min.   :-0.0000684   Min.   :0.05983  
##  1st Qu.:0.12457   1st Qu.:0.05357   1st Qu.: 0.0202758   1st Qu.:0.09938  
##  Median :0.13500   Median :0.08638   Median : 0.0789742   Median :0.11057  
##  Mean   :0.15247   Mean   :0.09338   Mean   : 0.0821429   Mean   :0.14004  
##  3rd Qu.:0.17245   3rd Qu.:0.13393   3rd Qu.: 0.1383007   3rd Qu.:0.17033  
##  Max.   :0.27404   Max.   :0.19267   Max.   : 0.1838969   Max.   :0.25614  
##  NA's   :1         NA's   :2         NA's   :3            NA's   :4        
##       TAI              HUN               HAN               KLP         
##  Min.   :0.1293   Min.   :0.04315   Min.   :0.03769   Min.   :0.05066  
##  1st Qu.:0.1818   1st Qu.:0.07836   1st Qu.:0.05082   1st Qu.:0.06001  
##  Median :0.1963   Median :0.08788   Median :0.09712   Median :0.13557  
##  Mean   :0.2157   Mean   :0.10679   Mean   :0.10658   Mean   :0.13089  
##  3rd Qu.:0.2423   3rd Qu.:0.12706   3rd Qu.:0.15227   3rd Qu.:0.19086  
##  Max.   :0.3508   Max.   :0.21464   Max.   :0.22468   Max.   :0.23994  
##  NA's   :5        NA's   :6         NA's   :7         NA's   :8        
##       HOC               QNC               SSK                KAC          
##  Min.   :0.04799   Min.   :0.07995   Min.   :0.003314   Min.   :0.001008  
##  1st Qu.:0.06683   1st Qu.:0.08570   1st Qu.:0.011973   1st Qu.:0.018685  
##  Median :0.10139   Median :0.17159   Median :0.110002   Median :0.127829  
##  Mean   :0.11328   Mean   :0.15440   Mean   :0.086747   Mean   :0.096343  
##  3rd Qu.:0.15323   3rd Qu.:0.21452   3rd Qu.:0.148341   3rd Qu.:0.161847  
##  Max.   :0.21621   Max.   :0.24560   Max.   :0.182914   Max.   :0.191394  
##  NA's   :9         NA's   :10        NA's   :11         NA's   :12        
##       CHA                KAN              UTS               CAM          
##  Min.   :0.003687   Min.   :0.1015   Min.   :0.08843   Min.   :0.008604  
##  1st Qu.:0.023862   1st Qu.:0.1789   1st Qu.:0.16593   1st Qu.:0.026054  
##  Median :0.125757   Median :0.1975   Median :0.17580   Median :0.118910  
##  Mean   :0.097930   Mean   :0.2137   Mean   :0.19727   Mean   :0.091140  
##  3rd Qu.:0.149282   3rd Qu.:0.2658   3rd Qu.:0.23861   3rd Qu.:0.127924  
##  Max.   :0.179347   Max.   :0.3277   Max.   :0.29881   Max.   :0.153901  
##  NA's   :13         NA's   :14       NA's   :15        NA's   :16        
##       KAG              BEN               LAM               SUF         
##  Min.   :0.1453   Min.   :0.02933   Min.   :0.01885   Min.   :0.03927  
##  1st Qu.:0.1535   1st Qu.:0.07534   1st Qu.:0.10804   1st Qu.:0.08085  
##  Median :0.1918   Median :0.13014   Median :0.13373   Median :0.09416  
##  Mean   :0.1996   Mean   :0.10628   Mean   :0.11948   Mean   :0.12977  
##  3rd Qu.:0.2348   3rd Qu.:0.13699   3rd Qu.:0.15615   3rd Qu.:0.14373  
##  Max.   :0.2757   Max.   :0.15984   Max.   :0.16749   Max.   :0.29085  
##  NA's   :17       NA's   :18        NA's   :19        NA's   :20       
##       SUU               KUN              INW               INJ        
##  Min.   :0.08632   Min.   :0.1695   Min.   :0.06375   Min.   :0.1298  
##  1st Qu.:0.09984   1st Qu.:0.2147   1st Qu.:0.08902   1st Qu.:0.1298  
##  Median :0.11444   Median :0.2599   Median :0.11429   Median :0.1298  
##  Mean   :0.14125   Mean   :0.2548   Mean   :0.11429   Mean   :0.1298  
##  3rd Qu.:0.15586   3rd Qu.:0.2974   3rd Qu.:0.13955   3rd Qu.:0.1298  
##  Max.   :0.24981   Max.   :0.3349   Max.   :0.16482   Max.   :0.1298  
##  NA's   :21        NA's   :22       NA's   :23        NA's   :24      
##       MAT     
##  Min.   : NA  
##  1st Qu.: NA  
##  Median : NA  
##  Mean   :NaN  
##  3rd Qu.: NA  
##  Max.   : NA  
##  NA's   :25

If you want, you can save the Fst values as a csv file.

# Convert to data frame
r2_0.01_df <- data.frame(r2_0.01_3)

# Save it
write.csv(r2_0.01_df, file = here("output", "fst", "r2_0.01_df.csv"))
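
To read it back later with the population names as row names, note that write.csv stores them in the first, unnamed column:

# Read the csv back, using the first column as row names
r2_0.01_df <- read.csv(here("output", "fst", "r2_0.01_df.csv"), row.names = 1)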

Check the Fst values

head(r2_0.01_df)
##            OKI        HAI        YUN       KAT       TAI HUN HAN KLP HOC QNC
## OKI         NA         NA         NA        NA        NA  NA  NA  NA  NA  NA
## HAI 0.08801929         NA         NA        NA        NA  NA  NA  NA  NA  NA
## YUN 0.12717041 0.05389545         NA        NA        NA  NA  NA  NA  NA  NA
## KAT 0.19706597 0.13069201 0.09803865        NA        NA  NA  NA  NA  NA  NA
## TAI 0.16437593 0.13725088 0.18359315 0.2561412        NA  NA  NA  NA  NA  NA
## HUN 0.07411078 0.03611331 0.07708983 0.1485081 0.1292503  NA  NA  NA  NA  NA
##     SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## YUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KAT  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Now let's get the Fst values from the object r2_0.01_3. When run with bootstraps, it also holds the CI limits and p-values alongside the Fst values. We will convert the data into a symmetric matrix.

aa <- as.matrix(r2_0.01_df)
# Mirror the lower triangle into the empty upper triangle to make the matrix symmetric
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##            OKI        HAI        YUN        KAT       TAI        HUN        HAN
## OKI         NA 0.08801929 0.12717041 0.19706597 0.1643759 0.07411078 0.11064999
## HAI 0.08801929         NA 0.05389545 0.13069201 0.1372509 0.03611331 0.03832073
## YUN 0.12717041 0.05389545         NA 0.09803865 0.1835931 0.07708983 0.04600133
## KAT 0.19706597 0.13069201 0.09803865         NA 0.2561412 0.14850811 0.13423035
## TAI 0.16437593 0.13725088 0.18359315 0.25614121        NA 0.12925031 0.17880081
## HUN 0.07411078 0.03611331 0.07708983 0.14850811 0.1292503         NA 0.05046773
##            KLP        HOC        QNC        SSK           KAC         CHA
## OKI 0.16656985 0.09549009 0.19007520 0.12817076  1.311262e-01 0.125744140
## HAI 0.08638337 0.03305455 0.12288092 0.05630853  5.254290e-02 0.053245910
## YUN 0.05670882 0.06185930 0.08085858 0.00366398 -6.838534e-05 0.001604481
## KAT 0.14708471 0.14244369 0.17033112 0.09937748  1.006626e-01 0.097234843
## TAI 0.23998253 0.15425313 0.24922008 0.18283845  1.967321e-01 0.178141681
## HUN 0.10841416 0.04314995 0.14309475 0.08054483  7.900004e-02 0.077728277
##           KAN        UTS         CAM        KAG        BEN         LAM
## OKI 0.1634040 0.13493024 0.114640596 0.12106263 0.14616521 0.132969333
## HAI 0.1371587 0.11075509 0.043819388 0.09793452 0.07341023 0.058877477
## YUN 0.1838969 0.15870552 0.003279041 0.13988820 0.02448527 0.006938259
## KAT 0.2486329 0.22282018 0.095547956 0.20548615 0.10205812 0.103613929
## TAI 0.2220346 0.19580993 0.169327020 0.18371222 0.19967862 0.191790397
## HUN 0.1110223 0.08527642 0.066905755 0.07356335 0.09705144 0.084689086
##            SUF        SUU       KUN       INW        INJ        MAT
## OKI 0.21643883 0.15398736 0.2740437 0.2564655 0.21162371 0.13506645
## HAI 0.14879059 0.08892088 0.1926687 0.1850661 0.14633947 0.06332853
## YUN 0.13353818 0.09595000 0.1591835 0.1508741 0.12218318 0.01887260
## KAT 0.05982659 0.06964651 0.2290544 0.1035899 0.09395829 0.11057327
## TAI 0.28326958 0.21716118 0.3507787 0.3354167 0.26762867 0.18747764
## HUN 0.16381699 0.10041490 0.2146355 0.2000360 0.16129956 0.08787756

Import sample locations

sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    Region2, Country
  )

# Check it
head(sampling_loc)
## # A tibble: 6 × 7
##   Pop_City Country Latitude Longitude Region Abbreviation Region2  
##   <chr>    <chr>      <dbl>     <dbl> <chr>  <chr>        <chr>    
## 1 Hainan   China       19.2      110. Asia   HAI          East Asia
## 2 Yunnan   China       24.5      101. Asia   YUN          East Asia
## 3 Hunan    China       27.6      112. Asia   HUN          East Asia
## 4 Okinawa  Japan       26.5      128. Asia   OKI          East Asia
## 5 Sendai   Japan       38.3      141. Asia   SEN          East Asia
## 6 Nagasaki Japan       32.8      130. Asia   NAG          East Asia

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##  [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"

Create vector with order of populations

# Extract the populations that appear in r2_0.01_df
populations_in_r2_0.01 <- colnames(r2_0.01_df)

# Keep only those populations, in the order given by order_pops
poporder <- order_pops[order_pops %in% populations_in_r2_0.01]

# Print the reordered populations
print(poporder)
##  [1] "HAI" "YUN" "HUN" "OKI" "KAN" "UTS" "KAG" "TAI" "BEN" "KUN" "KAT" "CAM"
## [13] "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "CHA" "LAM" "HAN" "HOC"
## [25] "QNC"

Let's check if the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Now let's order the matrix using poporder. We will also set the lower triangle of the matrix to NA so only one triangle is plotted.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1          Var2         value        
##  HAI    : 25   HAI    : 25   Min.   :-0.0001  
##  YUN    : 25   YUN    : 25   1st Qu.: 0.0797  
##  HUN    : 25   HUN    : 25   Median : 0.1309  
##  OKI    : 25   OKI    : 25   Mean   : 0.1312  
##  KAN    : 25   KAN    : 25   3rd Qu.: 0.1735  
##  UTS    : 25   UTS    : 25   Max.   : 0.3508  
##  (Other):475   (Other):475   NA's   :325

Now let's plot the data with ggplot. You can click on the little square at the top left of the plot to open it in a new window with the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.01.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)
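
If fonts render poorly in the PDF, ggsave() also accepts a device argument; a variant of the call above using the base-R cairo_pdf device:

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.01.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in",
  device = cairo_pdf
)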

By country

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(r2_0.01_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}
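
# A vectorized sketch of the same averaging (hypothetical alternative; note the
# diagonal comes out NaN rather than NA):
# symmetric_fst_matrix <- matrix(
#   rowMeans(cbind(as.vector(fst_matrix_summary),
#                  as.vector(t(fst_matrix_summary))), na.rm = TRUE),
#   nrow = nrow(fst_matrix_summary),
#   dimnames = dimnames(fst_matrix_summary)
# )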

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# The symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##              Cambodia      China      India  Indonesia     Japan   Malaysia
## Cambodia           NA 0.03800139 0.02605433 0.12095720 0.1346749 0.03574590
## China     0.038001395         NA 0.06498231 0.14143583 0.1092279 0.07026417
## India     0.026054327 0.06498231         NA 0.13584651 0.1678509 0.05980367
## Indonesia 0.120957203 0.14143583 0.13584651         NA 0.2313825 0.15803996
## Japan     0.134674917 0.10922792 0.16785086 0.23138254        NA 0.18141023
## Malaysia  0.035745900 0.07026417 0.05980367 0.15803996 0.1814102         NA
## Maldives  0.153900858 0.18882922 0.13014258 0.28386514 0.2940620 0.20471972
## Nepal     0.095547956 0.13143672 0.10205812 0.08175532 0.2113562 0.12882899
## Taiwan    0.169327020 0.14483616 0.19967862 0.27586902 0.1824474 0.21373008
## Thailand  0.006331097 0.04625628 0.02756166 0.12861509 0.1536568 0.03828791
## Vietnam   0.055359685 0.06885421 0.08411290 0.17209901 0.1477431 0.09951772
##            Maldives      Nepal    Taiwan    Thailand    Vietnam
## Cambodia  0.1539009 0.09554796 0.1693270 0.006331097 0.05535968
## China     0.1888292 0.13143672 0.1448362 0.046256283 0.06885421
## India     0.1301426 0.10205812 0.1996786 0.027561664 0.08411290
## Indonesia 0.2838651 0.08175532 0.2758690 0.128615090 0.17209901
## Japan     0.2940620 0.21135619 0.1824474 0.153656777 0.14774309
## Malaysia  0.2047197 0.12882899 0.2137301 0.038287915 0.09951772
## Maldives         NA 0.22905444 0.3507787 0.162330775 0.22600173
## Nepal     0.2290544         NA 0.2561412 0.100222216 0.14900172
## Taiwan    0.3507787 0.25614121        NA 0.187375652 0.19409134
## Thailand  0.1623308 0.10022222 0.1873757          NA 0.06532773
## Vietnam   0.2260017 0.14900172 0.1940913 0.065327725         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##           Cambodia      China      India Indonesia     Japan   Malaysia
## Cambodia        NA 0.03800139 0.02605433 0.1209572 0.1346749 0.03574590
## China           NA         NA 0.06498231 0.1414358 0.1092279 0.07026417
## India           NA         NA         NA 0.1358465 0.1678509 0.05980367
## Indonesia       NA         NA         NA        NA 0.2313825 0.15803996
## Japan           NA         NA         NA        NA        NA 0.18141023
## Malaysia        NA         NA         NA        NA        NA         NA
## Maldives        NA         NA         NA        NA        NA         NA
## Nepal           NA         NA         NA        NA        NA         NA
## Taiwan          NA         NA         NA        NA        NA         NA
## Thailand        NA         NA         NA        NA        NA         NA
## Vietnam         NA         NA         NA        NA        NA         NA
##            Maldives      Nepal    Taiwan    Thailand    Vietnam
## Cambodia  0.1539009 0.09554796 0.1693270 0.006331097 0.05535968
## China     0.1888292 0.13143672 0.1448362 0.046256283 0.06885421
## India     0.1301426 0.10205812 0.1996786 0.027561664 0.08411290
## Indonesia 0.2838651 0.08175532 0.2758690 0.128615090 0.17209901
## Japan     0.2940620 0.21135619 0.1824474 0.153656777 0.14774309
## Malaysia  0.2047197 0.12882899 0.2137301 0.038287915 0.09951772
## Maldives         NA 0.22905444 0.3507787 0.162330775 0.22600173
## Nepal            NA         NA 0.2561412 0.100222216 0.14900172
## Taiwan           NA         NA        NA 0.187375652 0.19409134
## Thailand         NA         NA        NA          NA 0.06532773
## Vietnam          NA         NA        NA          NA         NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##         Var1           Var2        value        
##  Cambodia :11   Cambodia :11   Min.   :0.00633  
##  China    :11   China    :11   1st Qu.:0.08293  
##  India    :11   India    :11   Median :0.14144  
##  Indonesia:11   Indonesia:11   Mean   :0.14212  
##  Japan    :11   Japan    :11   3rd Qu.:0.18810  
##  Malaysia :11   Malaysia :11   Max.   :0.35078  
##  (Other)  :55   (Other)  :55   NA's   :66

You can click on the little square at the top left of the plot to open it in a new window with the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "pink",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.01_by_country.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 25 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 OKI     0.152 
##  2 HAI     0.0932
##  3 YUN     0.0828
##  4 KAT     0.140 
##  5 TAI     0.211 
##  6 HUN     0.104 
##  7 HAN     0.103 
##  8 KLP     0.130 
##  9 HOC     0.104 
## 10 QNC     0.153 
## # ℹ 15 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Region
  )

# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)

# Rename the Region2 column to Region
fst3 <- fst3 |>
  dplyr::rename(Region = Region2)

# check output
head(fst3)
## # A tibble: 6 × 7
##   Pop_City   Country Latitude Longitude Abbreviation Region mean_fst
##   <chr>      <chr>      <dbl>     <dbl> <chr>        <chr>     <dbl>
## 1 Hainan     China       19.2      110. HAI          East     0.0932
## 2 Yunnan     China       24.5      101. YUN          East     0.0828
## 3 Hunan      China       27.6      112. HUN          East     0.104 
## 4 Okinawa    Japan       26.5      128. OKI          East     0.152 
## 5 Kanazawa   Japan       36.6      137. KAN          East     0.198 
## 6 Utsunomiya Japan       36.6      140. UTS          East     0.173

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0932
##  2 Yunnan      China       24.5      101.  YUN          East     0.0828
##  3 Hunan       China       27.6      112.  HUN          East     0.104 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.152 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.173 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.157 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.211 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0978
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.228 
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>

Mean by country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0932
##  2 Yunnan      China       24.5      101.  YUN          East     0.0828
##  3 Hunan       China       27.6      112.  HUN          East     0.104 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.152 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.173 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.157 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.211 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0978
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.228 
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City
  )

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
##    City        Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0932
##  2 Yunnan      China       24.5      101.  YUN          East     0.0828
##  3 Hunan       China       27.6      112.  HUN          East     0.104 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.152 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.173 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.157 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.211 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0978
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.228 
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## #   Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
## # A tibble: 6 × 9
##   Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
##   <chr>                         <dbl> <chr>               <dbl> <chr>  
## 1 Above 30N                      0.18 East                 0.15 Japan  
## 2 Above 30N                      0.18 East                 0.15 Japan  
## 3 Above 30N                      0.18 East                 0.15 Japan  
## 4 Below 30N                      0.13 East                 0.15 China  
## 5 Below 30N                      0.13 East                 0.15 China  
## 6 Below 30N                      0.13 East                 0.15 China  
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## #   Abbreviation <chr>, mean_fst <dbl>
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values using SNPs after prunning with r2 0.01.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values using SNPs after pruning with r2 0.01.

Latitude_group  mean_fst_by_latitude  Region     mean_fst_by_region  Country    mean_fst_by_country  City               Abbreviation  mean_fst
Above 30N       0.18                  East       0.15                Japan      0.17                 Kagoshima          KAG           0.16
Above 30N       0.18                  East       0.15                Japan      0.17                 Kanazawa           KAN           0.20
Above 30N       0.18                  East       0.15                Japan      0.17                 Utsunomiya         UTS           0.17
Below 30N       0.13                  East       0.15                China      0.09                 Hainan             HAI           0.09
Below 30N       0.13                  East       0.15                China      0.09                 Hunan              HUN           0.10
Below 30N       0.13                  East       0.15                China      0.09                 Yunnan             YUN           0.08
Below 30N       0.13                  East       0.15                Japan      0.17                 Okinawa            OKI           0.15
Below 30N       0.13                  East       0.15                Taiwan     0.21                 Tainan             TAI           0.21
Below 30N       0.13                  South      0.16                India      0.10                 Bengaluru          BEN           0.10
Below 30N       0.13                  South      0.16                Maldives   0.23                 Kunfunadhoo        KUN           0.23
Below 30N       0.13                  South      0.16                Nepal      0.14                 Kathmandu          KAT           0.14
Below 30N       0.13                  Southeast  0.12                Cambodia   0.08                 Phnom Penh         CAM           0.08
Below 30N       0.13                  Southeast  0.12                Indonesia  0.16                 Jakarta            INJ           0.16
Below 30N       0.13                  Southeast  0.12                Indonesia  0.16                 Sulawesi (Forest)  SUF           0.16
Below 30N       0.13                  Southeast  0.12                Indonesia  0.16                 Sulawesi (Urban)   SUU           0.12
Below 30N       0.13                  Southeast  0.12                Indonesia  0.16                 Wainyapu           INW           0.19
Below 30N       0.13                  Southeast  0.12                Malaysia   0.11                 Kuala Lumpur       KLP           0.13
Below 30N       0.13                  Southeast  0.12                Malaysia   0.11                 Tambun             MAT           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.09                 Chanthaburi        CHA           0.08
Below 30N       0.13                  Southeast  0.12                Thailand   0.09                 Kanchanaburi       KAC           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.09                 Lampang            LAM           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.09                 Sisaket            SSK           0.08
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Hanoi              HAN           0.10
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Ho Chi Minh City   HOC           0.10
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Quy Nhon City      QNC           0.15

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "fst", 
  "fst_r2_0.01_SNPS.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make a scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "fst", "r2_0.01_country.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           HAI      YUN       HUN      OKI       KAN       UTS       KAG
## HAI    0.0000 1035.308  965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081    0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN  965.7432 1107.931    0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283    0.000 1390.3468 1589.7383  617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347    0.0000  288.4938  790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738  288.4938    0.0000 1023.2713
##           TAI      BEN      KUN      KAT       CAM      SUF      SUU      INW
## HAI 1180.1356 3487.311 4241.329 2648.016  987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN  985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI  873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
##          INJ      KLP      MAT       SSK      KAC      CHA       LAM       HAN
## HAI 2844.575 1984.417 1861.932  722.0761 1218.270 1086.825 1070.0656  441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303  714.4744  601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029  953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
##          HOC       QNC
## HAI  990.911  604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
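
The double loop above can be replaced by a single call to geosphere::distm(), which applies the haversine formula to every pair of coordinates at once; a minimal sketch using the objects built above:

# Vectorized alternative: distm() returns the full n x n matrix of distances in meters
coords <- as.matrix(filtered_sampling_loc[, c("Longitude", "Latitude")])
distance_matrix_v <- distm(coords, fun = distHaversine) / 1000 # in km
rownames(distance_matrix_v) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix_v) <- filtered_sampling_loc$Abbreviation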

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0


# Align the Fst matrix with the distance matrix (same populations, same order)
aa <- aa[rownames(distance_matrix), colnames(distance_matrix)]

# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

head(data)
##    Distance        FST row_index col_index
## 1    0.0000 0.00000000       HAI       HAI
## 2 1035.3081 0.05389545       HAI       YUN
## 3  965.7432 0.03611331       HAI       HUN
## 4 2047.0214 0.08801929       HAI       OKI
## 5 3269.9948 0.13715870       HAI       KAN
## 6 3527.0034 0.11075509       HAI       UTS

Fit a linear regression of FST/(1 - FST) on log(distance), the standard isolation-by-distance linearization (Rousset 1997)

# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))


# Plot
ggplot(data, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = max(data$Distance) * 0.85, y = max(data$FST) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Distance (Km)",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_all_samples_LD1.pdf"),
  width = 6,
  height = 4,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
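
Because the entries of a pairwise matrix are not independent, the R^2 above is descriptive only; a Mantel test is the usual way to attach a p-value to isolation by distance. A minimal sketch, assuming the vegan package (not in the library list above) is installed:

library(vegan)

# Mantel test on the linearized Fst and log-distance matrices; as.dist() keeps
# only the lower triangle, so the zero diagonal is ignored
fst_lin <- aa / (1 - aa)
log_dist <- log(distance_matrix)
mantel(as.dist(fst_lin), as.dist(log_dist), permutations = 999)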

Subset by country. Select countries with at least 3 sampling localities.

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "China"     "Japan"     "Indonesia" "Thailand"  "Vietnam"

Do the test for each country. With only three localities a country contributes just three distinct pairs, so a very high R^2 (e.g., China below) reflects the tiny sample size and should not be over-interpreted.

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  subset_data <- subset_data[subset_data$Distance > 0, ]
  # Perform linear regression
  lm_model <- lm(FST/(1-FST) ~ log(Distance), data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $China
## $China$equation
## [1] "y = -0.64629x + 4.586"
## 
## $China$r2
## [1] "R^2 = 1.00"
## 
## 
## $Japan
## $Japan$equation
## [1] "y = 0.04826x + -0.142"
## 
## $Japan$r2
## [1] "R^2 = 0.10"
## 
## 
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.05298x + 0.518"
## 
## $Indonesia$r2
## [1] "R^2 = 0.15"
## 
## 
## $Thailand
## $Thailand$equation
## [1] "y = 0.29122x + -1.590"
## 
## $Thailand$r2
## [1] "R^2 = 0.40"
## 
## 
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.05230x + 0.485"
## 
## $Vietnam$r2
## [1] "R^2 = 0.16"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = Country) %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == Country)  # Ensures the data is within the same country


# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% countries_to_include) %>%
  ungroup()

Calculate the linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST/(1-FST) ~ log(Distance), data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("italic(y) == %.3f * italic(x) + %.3f", coef(model)[2], coef(model)[1]),
         r2 = sprintf("italic(R)^2 == %.2f", summary(model)$r.squared))

Plot it

ggplot(data_filtered, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(
    data = regression_results, 
    aes(label = paste(equation, r2, sep = "~~")), 
    x = Inf, y = Inf, 
    vjust = 2, hjust = 1.2, 
    size = 3.5, 
    parse = TRUE, 
    inherit.aes = FALSE
  ) +
  labs(title = "FST vs Distance by Country",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_countries_LD1.pdf"),
  width = 6,
  height = 8,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the elements based on their position in the matrix:
# Fst with two decimals above the diagonal, distance in km with zero decimals below
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     BEN    CAM    CHA    HAI    HAN    HOC    HUN    INJ    INW    KAC   
## BEN NA     "0.03" "0.03" "0.07" "0.07" "0.08" "0.10" "0.13" "0.16" "0.02"
## CAM "2976" NA     "0.00" "0.04" "0.04" "0.05" "0.07" "0.12" "0.15" "0.00"
## CHA "2660" "329"  NA     "0.05" "0.05" "0.06" "0.08" "0.12" "0.15" "0.00"
## HAI "3487" "987"  "1087" NA     "0.04" "0.03" "0.04" "0.15" "0.19" "0.05"
## HAN "3134" "1059" "1019" "441"  NA     "0.04" "0.05" "0.16" "0.21" "0.05"
## HOC "3174" "209"  "537"  "991"  "1146" NA     "0.04" "0.16" "0.20" "0.06"
## HUN "3913" "1929" "1956" "966"  "954"  "1954" NA     "0.16" "0.20" "0.08"
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA     "0.06" "0.13"
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA     "0.16"
## KAC "2377" "647"  "320"  "1218" "1028" "856"  "1981" "2393" "3492" NA    
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713"
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456"
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119"
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234"
## KUN "957"  "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052"
## LAM "2419" "951"  "692"  "1070" "733"  "1139" "1637" "2844" "3859" "475" 
## MAT "2751" "875"  "893"  "1862" "1894" "912"  "2799" "1363" "2650" "1060"
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265"
## QNC "3424" "528"  "782"  "604"  "882"  "436"  "1565" "2241" "2880" "1047"
## SSK "2894" "402"  "368"  "722"  "678"  "547"  "1595" "2390" "3271" "530" 
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859"  "2874"
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973"  "3147"
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986"  "3562" "3635" "2397"
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722"
## YUN "2806" "1487" "1323" "1035" "601"  "1627" "1108" "3467" "4320" "1179"
##     KAG    KAN    KAT    KLP    KUN    LAM    MAT    OKI    QNC    SSK   
## BEN "0.16" "0.20" "0.10" "0.08" "0.13" "0.03" "0.04" "0.15" "0.10" "0.03"
## CAM "0.13" "0.17" "0.10" "0.05" "0.15" "0.01" "0.02" "0.11" "0.08" "0.01"
## CHA "0.14" "0.18" "0.10" "0.05" "0.15" "0.01" "0.02" "0.13" "0.08" "0.00"
## HAI "0.10" "0.14" "0.13" "0.09" "0.19" "0.06" "0.06" "0.09" "0.12" "0.06"
## HAN "0.12" "0.17" "0.13" "0.09" "0.22" "0.05" "0.06" "0.11" "0.13" "0.05"
## HOC "0.11" "0.15" "0.14" "0.10" "0.22" "0.07" "0.07" "0.10" "0.14" "0.06"
## HUN "0.07" "0.11" "0.15" "0.11" "0.21" "0.08" "0.09" "0.07" "0.14" "0.08"
## INJ "0.22" "0.26" "0.09" "0.17" "0.26" "0.13" "0.13" "0.21" "0.19" "0.12"
## INW "0.26" "0.32" "0.10" "0.23" "0.33" "0.16" "0.16" "0.26" "0.23" "0.15"
## KAC "0.14" "0.19" "0.10" "0.06" "0.17" "0.01" "0.02" "0.13" "0.08" "0.00"
## KAG NA     "0.12" "0.21" "0.18" "0.28" "0.15" "0.15" "0.12" "0.20" "0.14"
## KAN "791"  NA     "0.25" "0.23" "0.33" "0.19" "0.19" "0.16" "0.25" "0.18"
## KAT "4367" "4880" NA     "0.15" "0.23" "0.10" "0.11" "0.20" "0.17" "0.10"
## KLP "4376" "5163" "3242" NA     "0.24" "0.06" "0.06" "0.17" "0.14" "0.06"
## KUN "6631" "7300" "2779" "3196" NA     "0.17" "0.17" "0.27" "0.24" "0.16"
## LAM "3450" "4160" "1789" "1704" "3202" NA     "0.02" "0.13" "0.09" "0.01"
## MAT "4289" "5073" "3069" "177"  "3124" "1531" NA     "0.14" "0.10" "0.02"
## OKI "618"  "1390" "4204" "3822" "6262" "3060" "3747" NA     "0.19" "0.13"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA     "0.08"
## SSK "3232" "3991" "2412" "1365" "3582" "624"  "1217" "2758" "548"  NA    
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873"  "1548" "1886"
## UTS "1023" "288"  "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714"  "2208" "2678" "1449" "1087"
##     SUF    SUU    TAI    UTS    YUN    
## BEN "0.14" "0.11" "0.20" "0.17" "0.02" 
## CAM "0.13" "0.09" "0.17" "0.14" "0.00" 
## CHA "0.13" "0.09" "0.18" "0.16" "0.00" 
## HAI "0.15" "0.09" "0.14" "0.11" "0.05" 
## HAN "0.17" "0.10" "0.18" "0.14" "0.05" 
## HOC "0.16" "0.10" "0.15" "0.12" "0.06" 
## HUN "0.16" "0.10" "0.13" "0.09" "0.08" 
## INJ "0.08" "0.09" "0.27" "0.24" "0.12" 
## INW "0.09" "0.12" "0.34" "0.28" "0.15" 
## KAC "0.14" "0.10" "0.20" "0.17" "-0.00"
## KAG "0.23" "0.16" "0.18" "0.09" "0.14" 
## KAN "0.27" "0.21" "0.22" "0.10" "0.18" 
## KAT "0.06" "0.07" "0.26" "0.22" "0.10" 
## KLP "0.19" "0.14" "0.24" "0.20" "0.06" 
## KUN "0.29" "0.25" "0.35" "0.30" "0.16" 
## LAM "0.14" "0.10" "0.19" "0.17" "0.01" 
## MAT "0.14" "0.10" "0.19" "0.17" "0.02" 
## OKI "0.22" "0.15" "0.16" "0.13" "0.13" 
## QNC "0.21" "0.17" "0.25" "0.22" "0.08" 
## SSK "0.13" "0.10" "0.18" "0.16" "0.00" 
## SUF NA     "0.04" "0.28" "0.24" "0.13" 
## SUU "381"  NA     "0.22" "0.18" "0.10" 
## TAI "2777" "2747" NA     "0.20" "0.18" 
## UTS "4755" "4566" "2418" NA     "0.16" 
## YUN "3566" "3744" "1928" "3902" NA
cities <- readRDS(here("output", "populations", "cities.rds"))
head(cities)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Initialize and fill the merged matrix, as before
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format values, taking the absolute value of the Fst (upper triangle) so small negative estimates print as 0.00
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     SSK    QNC    CAM    HOC    CHA    HAI    HAN    KAC    LAM    YUN   
## SSK NA     "0.08" "0.01" "0.06" "0.00" "0.06" "0.05" "0.00" "0.01" "0.00"
## QNC "548"  NA     "0.08" "0.14" "0.08" "0.12" "0.13" "0.08" "0.09" "0.08"
## CAM "402"  "528"  NA     "0.05" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## HOC "547"  "436"  "209"  NA     "0.06" "0.03" "0.04" "0.06" "0.07" "0.06"
## CHA "368"  "782"  "329"  "537"  NA     "0.05" "0.05" "0.00" "0.01" "0.00"
## HAI "722"  "604"  "987"  "991"  "1087" NA     "0.04" "0.05" "0.06" "0.05"
## HAN "678"  "882"  "1059" "1146" "1019" "441"  NA     "0.05" "0.05" "0.05"
## KAC "530"  "1047" "647"  "856"  "320"  "1218" "1028" NA     "0.01" "0.00"
## LAM "624"  "1155" "951"  "1139" "692"  "1070" "733"  "475"  NA     "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601"  "1179" "714"  NA    
## MAT "1217" "1348" "875"  "912"  "893"  "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966"  "954"  "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
##     MAT    HUN    KLP    TAI    OKI    SUF    INJ    SUU    KAT    KAG   
## SSK "0.02" "0.08" "0.06" "0.18" "0.13" "0.13" "0.12" "0.10" "0.10" "0.14"
## QNC "0.10" "0.14" "0.14" "0.25" "0.19" "0.21" "0.19" "0.17" "0.17" "0.20"
## CAM "0.02" "0.07" "0.05" "0.17" "0.11" "0.13" "0.12" "0.09" "0.10" "0.13"
## HOC "0.07" "0.04" "0.10" "0.15" "0.10" "0.16" "0.16" "0.10" "0.14" "0.11"
## CHA "0.02" "0.08" "0.05" "0.18" "0.13" "0.13" "0.12" "0.09" "0.10" "0.14"
## HAI "0.06" "0.04" "0.09" "0.14" "0.09" "0.15" "0.15" "0.09" "0.13" "0.10"
## HAN "0.06" "0.05" "0.09" "0.18" "0.11" "0.17" "0.16" "0.10" "0.13" "0.12"
## KAC "0.02" "0.08" "0.06" "0.20" "0.13" "0.14" "0.13" "0.10" "0.10" "0.14"
## LAM "0.02" "0.08" "0.06" "0.19" "0.13" "0.14" "0.13" "0.10" "0.10" "0.15"
## YUN "0.02" "0.08" "0.06" "0.18" "0.13" "0.13" "0.12" "0.10" "0.10" "0.14"
## MAT NA     "0.09" "0.06" "0.19" "0.14" "0.14" "0.13" "0.10" "0.11" "0.15"
## HUN "2799" NA     "0.11" "0.13" "0.07" "0.16" "0.16" "0.10" "0.15" "0.07"
## KLP "177"  "2932" NA     "0.24" "0.17" "0.19" "0.17" "0.14" "0.15" "0.18"
## TAI "2894" "986"  "2979" NA     "0.16" "0.28" "0.27" "0.22" "0.26" "0.18"
## OKI "3747" "1599" "3822" "873"  NA     "0.22" "0.21" "0.15" "0.20" "0.12"
## SUF "2220" "3406" "2115" "2777" "3280" NA     "0.08" "0.04" "0.06" "0.23"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA     "0.09" "0.09" "0.22"
## SUU "2565" "3468" "2469" "2747" "3154" "381"  "1911" NA     "0.07" "0.16"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA     "0.21"
## KAG "4289" "1860" "4376" "1397" "618"  "3893" "4901" "3756" "4367" NA    
## INW "2650" "4248" "2505" "3635" "4110" "859"  "1528" "973"  "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791" 
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
##     INW    BEN    KAN    UTS    KUN   
## SSK "0.15" "0.03" "0.18" "0.16" "0.16"
## QNC "0.23" "0.10" "0.25" "0.22" "0.24"
## CAM "0.15" "0.03" "0.17" "0.14" "0.15"
## HOC "0.20" "0.08" "0.15" "0.12" "0.22"
## CHA "0.15" "0.03" "0.18" "0.16" "0.15"
## HAI "0.19" "0.07" "0.14" "0.11" "0.19"
## HAN "0.21" "0.07" "0.17" "0.14" "0.22"
## KAC "0.16" "0.02" "0.19" "0.17" "0.17"
## LAM "0.16" "0.03" "0.19" "0.17" "0.17"
## YUN "0.15" "0.02" "0.18" "0.16" "0.16"
## MAT "0.16" "0.04" "0.19" "0.17" "0.17"
## HUN "0.20" "0.10" "0.11" "0.09" "0.21"
## KLP "0.23" "0.08" "0.23" "0.20" "0.24"
## TAI "0.34" "0.20" "0.22" "0.20" "0.35"
## OKI "0.26" "0.15" "0.16" "0.13" "0.27"
## SUF "0.09" "0.14" "0.27" "0.24" "0.29"
## INJ "0.06" "0.13" "0.26" "0.24" "0.26"
## SUU "0.12" "0.11" "0.21" "0.18" "0.25"
## KAT "0.10" "0.10" "0.25" "0.22" "0.23"
## KAG "0.26" "0.16" "0.12" "0.09" "0.28"
## INW NA     "0.16" "0.32" "0.28" "0.33"
## BEN "5347" NA     "0.20" "0.17" "0.13"
## KAN "5424" "6410" NA     "0.10" "0.33"
## UTS "5539" "6696" "288"  NA     "0.30"
## KUN "5513" "957"  "7300" "7583" NA

Make a table and save it as a Word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]


# Create a flextable object from the merged data frame
ft <- qflextable(merged_df)

ft

[Flextable rendering of the merged matrix above, sorted by mean distance: pairwise Fst values in the upper triangle and pairwise geographic distances in km in the lower triangle.]

# Create a new Word document
doc <- read_docx()

# Add the flextable to the Word document
doc <- body_add_flextable(doc, ft)

# Save the Word document
print(doc, target =  here("output", "fst", "LD1.docx"))

3. SNP set r2 0.1

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  6244094 333.5   11233748 600.0         NA 10670513 569.9
## Vcells 10548729  80.5   18029388 137.6      32768 14957815 114.2
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.1 \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/r2_0.1 \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.1.log
## 58152 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.972163.
## 58152 variants and 230 people pass filters and QC.

See https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of the Fst estimation. The --recodeA step above writes an additive (0/1/2) .raw file, which read.PLINK() loads as a genlight object.

r2_0.1 <-
  read.PLINK(
    here(
      "output", "fst", "r2_0.1.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

summary(r2_0.1)

This chunk will take a couple of minutes to run.

# convert
r2_0.1_2 <- stamppConvert(r2_0.1, type="genlight")

# run stampp. If you want to run it with bootstraps and nclusters, use the HPC; it will run out of memory on a 32 GB laptop
r2_0.1_3 <- stamppFst(r2_0.1_2, 1, 95, 1)
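
The three positional arguments above are nboots, percent, and nclusters. A sketch of the bootstrapped run for the HPC, with named arguments (the settings shown are illustrative, not from this analysis):

# Bootstrapped run for the HPC (illustrative settings): 100 bootstraps,
# 95% confidence intervals, 10 cores
r2_0.1_3 <- stamppFst(r2_0.1_2, nboots = 100, percent = 95, nclusters = 10)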

Save it

saveRDS(
  r2_0.1_3, here(
    "output", "fst", "r2_0.1.rds"
  )
)

To load it

r2_0.1_3 <- readRDS(
  here(
    "output", "fst", "r2_0.1.rds"
  )
)

Now let's look at the object

summary(r2_0.1_3)
##       OKI               HAI               YUN                 KAT         
##  Min.   :0.07442   Min.   :0.03415   Min.   :-0.000061   Min.   :0.06197  
##  1st Qu.:0.11875   1st Qu.:0.04948   1st Qu.: 0.020653   1st Qu.:0.10390  
##  Median :0.13231   Median :0.08208   Median : 0.074156   Median :0.11475  
##  Mean   :0.14973   Mean   :0.09324   Mean   : 0.081748   Mean   :0.14294  
##  3rd Qu.:0.17241   3rd Qu.:0.13763   3rd Qu.: 0.138696   3rd Qu.:0.17543  
##  Max.   :0.26203   Max.   :0.19423   Max.   : 0.184499   Max.   :0.26064  
##  NA's   :1         NA's   :2         NA's   :3           NA's   :4        
##       TAI              HUN               HAN               KLP         
##  Min.   :0.1335   Min.   :0.04272   Min.   :0.03747   Min.   :0.05153  
##  1st Qu.:0.1827   1st Qu.:0.07029   1st Qu.:0.04754   1st Qu.:0.05847  
##  Median :0.1963   Median :0.08814   Median :0.09929   Median :0.13852  
##  Mean   :0.2179   Mean   :0.10291   Mean   :0.10694   Mean   :0.13005  
##  3rd Qu.:0.2400   3rd Qu.:0.12577   3rd Qu.:0.15623   3rd Qu.:0.18669  
##  Max.   :0.3444   Max.   :0.20465   Max.   :0.22103   Max.   :0.24103  
##  NA's   :5        NA's   :6         NA's   :7         NA's   :8        
##       HOC               QNC               SSK                KAC          
##  Min.   :0.04642   Min.   :0.07948   Min.   :0.002698   Min.   :0.001375  
##  1st Qu.:0.06326   1st Qu.:0.08570   1st Qu.:0.011550   1st Qu.:0.018868  
##  Median :0.10400   Median :0.17367   Median :0.113973   Median :0.132199  
##  Mean   :0.11322   Mean   :0.15507   Mean   :0.087070   Mean   :0.095838  
##  3rd Qu.:0.15567   3rd Qu.:0.21453   3rd Qu.:0.147997   3rd Qu.:0.153453  
##  Max.   :0.21166   Max.   :0.24452   Max.   :0.179032   Max.   :0.184788  
##  NA's   :9         NA's   :10        NA's   :11         NA's   :12        
##       CHA                KAN              UTS               CAM          
##  Min.   :0.004038   Min.   :0.1031   Min.   :0.08997   Min.   :0.008172  
##  1st Qu.:0.023675   1st Qu.:0.1759   1st Qu.:0.15738   1st Qu.:0.025363  
##  Median :0.128982   Median :0.1946   Median :0.17412   Median :0.121331  
##  Mean   :0.099625   Mean   :0.2137   Mean   :0.19413   Mean   :0.094094  
##  3rd Qu.:0.149675   3rd Qu.:0.2701   3rd Qu.:0.24146   3rd Qu.:0.137461  
##  Max.   :0.177767   Max.   :0.3177   Max.   :0.28404   Max.   :0.161974  
##  NA's   :13         NA's   :14       NA's   :15        NA's   :16        
##       KAG              BEN               LAM               SUF         
##  Min.   :0.1362   Min.   :0.02818   Min.   :0.01876   Min.   :0.04022  
##  1st Qu.:0.1471   1st Qu.:0.07735   1st Qu.:0.10946   1st Qu.:0.07931  
##  Median :0.1938   Median :0.13012   Median :0.13912   Median :0.09606  
##  Mean   :0.1963   Mean   :0.11092   Mean   :0.12277   Mean   :0.13239  
##  3rd Qu.:0.2366   3rd Qu.:0.14495   3rd Qu.:0.16100   3rd Qu.:0.15057  
##  Max.   :0.2646   Max.   :0.17353   Max.   :0.17178   Max.   :0.29578  
##  NA's   :17       NA's   :18        NA's   :19        NA's   :20       
##       SUU               KUN              INW               INJ        
##  Min.   :0.08678   Min.   :0.1681   Min.   :0.06515   Min.   :0.1366  
##  1st Qu.:0.10211   1st Qu.:0.2155   1st Qu.:0.09282   1st Qu.:0.1366  
##  Median :0.11800   Median :0.2629   Median :0.12049   Median :0.1366  
##  Mean   :0.14286   Mean   :0.2578   Mean   :0.12049   Mean   :0.1366  
##  3rd Qu.:0.15875   3rd Qu.:0.3027   3rd Qu.:0.14816   3rd Qu.:0.1366  
##  Max.   :0.24867   Max.   :0.3425   Max.   :0.17582   Max.   :0.1366  
##  NA's   :21        NA's   :22       NA's   :23        NA's   :24      
##       MAT     
##  Min.   : NA  
##  1st Qu.: NA  
##  Median : NA  
##  Mean   :NaN  
##  3rd Qu.: NA  
##  Max.   : NA  
##  NA's   :25

If you want, you can save the Fst values as a CSV file.

# Convert to data frame
r2_0.1_df <- data.frame(r2_0.1_3)

# Save it
write.csv(r2_0.1_df, file = here("output", "fst", "r2_0.1_df.csv"))
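
To read the table back later (the first CSV column holds the population codes):

# Read the CSV back, restoring population codes as row names
r2_0.1_df <- read.csv(here("output", "fst", "r2_0.1_df.csv"), row.names = 1)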

Check the Fst values

head(r2_0.1_df)
##            OKI        HAI        YUN       KAT       TAI HUN HAN KLP HOC QNC
## OKI         NA         NA         NA        NA        NA  NA  NA  NA  NA  NA
## HAI 0.08802883         NA         NA        NA        NA  NA  NA  NA  NA  NA
## YUN 0.11878133 0.04941474         NA        NA        NA  NA  NA  NA  NA  NA
## KAT 0.19738113 0.13464192 0.10181861        NA        NA  NA  NA  NA  NA  NA
## TAI 0.16804359 0.14154214 0.18449852 0.2606445        NA  NA  NA  NA  NA  NA
## HUN 0.07441617 0.03534541 0.06802916 0.1476221 0.1334822  NA  NA  NA  NA  NA
##     SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## YUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KAT  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Now let's get the Fst values from the object r2_0.1_3. When run with bootstraps (as on the HPC), it also holds the bootstrap replicates, CI limits, and p-values. We will convert the data into a matrix.

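# stamppFst fills only the lower triangle; mirror it across the diagonal so
# that aa becomes a full symmetric matrix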
aa <- as.matrix(r2_0.1_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##            OKI        HAI        YUN       KAT       TAI        HUN        HAN
## OKI         NA 0.08802883 0.11878133 0.1973811 0.1680436 0.07441617 0.10763474
## HAI 0.08802883         NA 0.04941474 0.1346419 0.1415421 0.03534541 0.03825566
## YUN 0.11878133 0.04941474         NA 0.1018186 0.1844985 0.06802916 0.04338869
## KAT 0.19738113 0.13464192 0.10181861        NA 0.2606445 0.14762208 0.13924461
## TAI 0.16804359 0.14154214 0.18449852 0.2606445        NA 0.13348223 0.18073656
## HUN 0.07441617 0.03534541 0.06802916 0.1476221 0.1334822         NA 0.04712316
##            KLP        HOC       QNC         SSK           KAC        CHA
## OKI 0.15791274 0.09583416 0.1855034 0.120337901  1.222639e-01 0.11864070
## HAI 0.08207801 0.03414611 0.1192940 0.051748990  4.855596e-02 0.04954779
## YUN 0.05497547 0.05872326 0.0802834 0.003470757 -6.095245e-05 0.00216753
## KAT 0.14992931 0.14537279 0.1754255 0.103904742  1.043844e-01 0.10291812
## TAI 0.23665704 0.15932868 0.2500647 0.183363174  1.938385e-01 0.18037855
## HUN 0.09955817 0.04272361 0.1355892 0.070972254  6.854084e-02 0.06961833
##           KAN        UTS         CAM        KAG        BEN         LAM
## OKI 0.1661065 0.13562198 0.109049041 0.12082719 0.13838680 0.123949616
## HAI 0.1406218 0.11163440 0.041994919 0.09877636 0.06924561 0.053818795
## YUN 0.1797822 0.14989155 0.003581621 0.13178798 0.02349872 0.006781344
## KAT 0.2507648 0.22214457 0.101256096 0.20504141 0.10811511 0.106234969
## TAI 0.2282426 0.19867600 0.171069919 0.18782724 0.20034447 0.189514500
## HUN 0.1159453 0.08822364 0.059878361 0.07511422 0.08813833 0.073799872
##            SUF        SUU       KUN       INW        INJ        MAT
## OKI 0.22063500 0.15757698 0.2620293 0.2615279 0.21402287 0.12899644
## HAI 0.15560449 0.09438831 0.1867310 0.1942290 0.15201957 0.06079123
## YUN 0.14099822 0.09767201 0.1564464 0.1629609 0.12806009 0.01970472
## KAT 0.06196652 0.07006897 0.2304904 0.1073455 0.09416537 0.11474970
## TAI 0.29024752 0.22479493 0.3443908 0.3418771 0.27387778 0.18974486
## HUN 0.16664122 0.10268620 0.2017426 0.2046510 0.16392725 0.08049047

Import sample locations

sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    Region2, Country
  )

# Check it
head(sampling_loc)
## # A tibble: 6 × 7
##   Pop_City Country Latitude Longitude Region Abbreviation Region2  
##   <chr>    <chr>      <dbl>     <dbl> <chr>  <chr>        <chr>    
## 1 Hainan   China       19.2      110. Asia   HAI          East Asia
## 2 Yunnan   China       24.5      101. Asia   YUN          East Asia
## 3 Hunan    China       27.6      112. Asia   HUN          East Asia
## 4 Okinawa  Japan       26.5      128. Asia   OKI          East Asia
## 5 Sendai   Japan       38.3      141. Asia   SEN          East Asia
## 6 Nagasaki Japan       32.8      130. Asia   NAG          East Asia

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##  [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"

Create a vector with the order of populations

# Extract the populations that appear in r2_0.1_df
populations_in_r2_0.1 <- colnames(r2_0.1_df)

# Reorder the populations based on order_pops
poporder <- populations_in_r2_0.1[populations_in_r2_0.1 %in% order_pops]

# Print the reordered populations
print(poporder)
##  [1] "OKI" "HAI" "YUN" "KAT" "TAI" "HUN" "HAN" "KLP" "HOC" "QNC" "SSK" "KAC"
## [13] "CHA" "KAN" "UTS" "CAM" "KAG" "BEN" "LAM" "SUF" "SUU" "KUN" "INW" "INJ"
## [25] "MAT"

Let's check whether the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Now let's order the matrix using poporder. We will also set the lower triangle of the matrix to NA so each pairwise value appears only once.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1          Var2         value        
##  OKI    : 25   OKI    : 25   Min.   :-0.0001  
##  HAI    : 25   HAI    : 25   1st Qu.: 0.0749  
##  YUN    : 25   YUN    : 25   Median : 0.1320  
##  KAT    : 25   KAT    : 25   Mean   : 0.1313  
##  TAI    : 25   TAI    : 25   3rd Qu.: 0.1763  
##  HUN    : 25   HUN    : 25   Max.   : 0.3444  
##  (Other):475   (Other):475   NA's   :325
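
melt() comes from reshape2; if you prefer to stay in the tidyverse, the same long table can be built with pivot_longer(). A sketch that keeps the Var1/Var2/value names the plot code below expects:

# tidyverse equivalent of reshape2::melt() on the Fst matrix
pairfst.long <- as.data.frame(aa) |>
  tibble::rownames_to_column("Var1") |>
  pivot_longer(-Var1, names_to = "Var2", values_to = "value") |>
  mutate(Var1 = factor(Var1, levels = rownames(aa)),
         Var2 = factor(Var2, levels = colnames(aa)))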

Now let's plot the data with ggplot. You can click the little square at the top left of the plot pane to open it in a new window with the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.1.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

By country

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(r2_0.1_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##              Cambodia      China      India  Indonesia     Japan   Malaysia
## Cambodia           NA 0.03515163 0.02536262 0.12941643 0.1297409 0.03646673
## China     0.035151634         NA 0.06029422 0.14698652 0.1075254 0.06626635
## India     0.025362622 0.06029422         NA 0.14413628 0.1619263 0.05902305
## Indonesia 0.129416433 0.14698652 0.14413628         NA 0.2343967 0.16434783
## Japan     0.129740854 0.10752536 0.16192628 0.23439668        NA 0.17315795
## Malaysia  0.036466732 0.06626635 0.05902305 0.16434783 0.1731579         NA
## Maldives  0.152913626 0.18164001 0.13012441 0.28745228 0.2816610 0.20454714
## Nepal     0.101256096 0.13292617 0.10811511 0.08338658 0.2116824 0.13233951
## Taiwan    0.171069919 0.14825128 0.20034447 0.28269932 0.1864794 0.21320095
## Thailand  0.006041354 0.04158013 0.02653178 0.13544451 0.1460455 0.03823662
## Vietnam   0.054734247 0.06661412 0.08278023 0.17914574 0.1451190 0.09840066
##            Maldives      Nepal    Taiwan    Thailand    Vietnam
## Cambodia  0.1529136 0.10125610 0.1710699 0.006041354 0.05473425
## China     0.1816400 0.13292617 0.1482513 0.041580126 0.06661412
## India     0.1301244 0.10811511 0.2003445 0.026531776 0.08278023
## Indonesia 0.2874523 0.08338658 0.2826993 0.135444507 0.17914574
## Japan     0.2816610 0.21168235 0.1864794 0.146045512 0.14511899
## Malaysia  0.2045471 0.13233951 0.2132009 0.038236622 0.09840066
## Maldives         NA 0.23049036 0.3443908 0.161208330 0.22357228
## Nepal     0.2304904         NA 0.2606445 0.104360558 0.15334765
## Taiwan    0.3443908 0.26064445        NA 0.186773674 0.19670999
## Thailand  0.1612083 0.10436056 0.1867737          NA 0.06323481
## Vietnam   0.2235723 0.15334765 0.1967100 0.063234806         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##           Cambodia      China      India Indonesia     Japan   Malaysia
## Cambodia        NA 0.03515163 0.02536262 0.1294164 0.1297409 0.03646673
## China           NA         NA 0.06029422 0.1469865 0.1075254 0.06626635
## India           NA         NA         NA 0.1441363 0.1619263 0.05902305
## Indonesia       NA         NA         NA        NA 0.2343967 0.16434783
## Japan           NA         NA         NA        NA        NA 0.17315795
## Malaysia        NA         NA         NA        NA        NA         NA
## Maldives        NA         NA         NA        NA        NA         NA
## Nepal           NA         NA         NA        NA        NA         NA
## Taiwan          NA         NA         NA        NA        NA         NA
## Thailand        NA         NA         NA        NA        NA         NA
## Vietnam         NA         NA         NA        NA        NA         NA
##            Maldives      Nepal    Taiwan    Thailand    Vietnam
## Cambodia  0.1529136 0.10125610 0.1710699 0.006041354 0.05473425
## China     0.1816400 0.13292617 0.1482513 0.041580126 0.06661412
## India     0.1301244 0.10811511 0.2003445 0.026531776 0.08278023
## Indonesia 0.2874523 0.08338658 0.2826993 0.135444507 0.17914574
## Japan     0.2816610 0.21168235 0.1864794 0.146045512 0.14511899
## Malaysia  0.2045471 0.13233951 0.2132009 0.038236622 0.09840066
## Maldives         NA 0.23049036 0.3443908 0.161208330 0.22357228
## Nepal            NA         NA 0.2606445 0.104360558 0.15334765
## Taiwan           NA         NA        NA 0.186773674 0.19670999
## Thailand         NA         NA        NA          NA 0.06323481
## Vietnam          NA         NA        NA          NA         NA
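
The double loop is explicit and easy to follow; an equivalent vectorized sketch (written to a suffixed object so it does not clobber the masked matrix above) averages each off-diagonal pair while ignoring NAs:

# Vectorized symmetrization: element-wise mean of M and t(M), ignoring NAs
M <- fst_matrix_summary
sums   <- ifelse(is.na(M), 0, M) + ifelse(is.na(t(M)), 0, t(M))
counts <- (!is.na(M)) + (!is.na(t(M)))
symmetric_fst_matrix_v <- sums / counts  # NaN (rather than NA) where both cells are empty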

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##         Var1           Var2        value        
##  Cambodia :11   Cambodia :11   Min.   :0.00604  
##  China    :11   China    :11   1st Qu.:0.08308  
##  India    :11   India    :11   Median :0.14512  
##  Indonesia:11   Indonesia:11   Mean   :0.14236  
##  Japan    :11   Japan    :11   3rd Qu.:0.18663  
##  Malaysia :11   Malaysia :11   Max.   :0.34439  
##  (Other)  :55   (Other)  :55   NA's   :66

You can click the little square at the top left of the plot pane to open it in a new window with the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "orange",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.1_by_country.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 25 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 OKI     0.150 
##  2 HAI     0.0930
##  3 YUN     0.0819
##  4 KAT     0.143 
##  5 TAI     0.213 
##  6 HUN     0.101 
##  7 HAN     0.103 
##  8 KLP     0.128 
##  9 HOC     0.104 
## 10 QNC     0.153 
## # ℹ 15 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Region
  )

# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)

# Rename the Region2 column to Region
fst3 <- fst3 |>
  dplyr::rename(Region = Region2)

# check output
head(fst3)
## # A tibble: 6 × 7
##   Pop_City   Country Latitude Longitude Abbreviation Region mean_fst
##   <chr>      <chr>      <dbl>     <dbl> <chr>        <chr>     <dbl>
## 1 Hainan     China       19.2      110. HAI          East     0.0930
## 2 Yunnan     China       24.5      101. YUN          East     0.0819
## 3 Hunan      China       27.6      112. HUN          East     0.101 
## 4 Okinawa    Japan       26.5      128. OKI          East     0.150 
## 5 Kanazawa   Japan       36.6      137. KAN          East     0.198 
## 6 Utsunomiya Japan       36.6      140. UTS          East     0.169

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0930
##  2 Yunnan      China       24.5      101.  YUN          East     0.0819
##  3 Hunan       China       27.6      112.  HUN          East     0.101 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.150 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.169 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.154 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.213 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0974
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.225 
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>

Mean by country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0930
##  2 Yunnan      China       24.5      101.  YUN          East     0.0819
##  3 Hunan       China       27.6      112.  HUN          East     0.101 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.150 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.169 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.154 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.213 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0974
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.225 
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City
  )

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
##    City        Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0930
##  2 Yunnan      China       24.5      101.  YUN          East     0.0819
##  3 Hunan       China       27.6      112.  HUN          East     0.101 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.150 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.169 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.154 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.213 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0974
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.225 
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## #   Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
## # A tibble: 6 × 9
##   Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
##   <chr>                         <dbl> <chr>               <dbl> <chr>  
## 1 Above 30N                      0.17 East                 0.14 Japan  
## 2 Above 30N                      0.17 East                 0.14 Japan  
## 3 Above 30N                      0.17 East                 0.14 Japan  
## 4 Below 30N                      0.13 East                 0.14 China  
## 5 Below 30N                      0.13 East                 0.14 China  
## 6 Below 30N                      0.13 East                 0.14 China  
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## #   Abbreviation <chr>, mean_fst <dbl>
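
The mutate_if() call above still works, but it has been superseded in current dplyr; the same rounding with the across() idiom:

# Current dplyr idiom for conditional mutation
fst4 <- fst4 |>
  mutate(across(where(is.numeric), ~ round(.x, 2)))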
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values using SNPs after prunning with r2 0.1.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values using SNPs after pruning with r2 0.1.

Latitude_group  mean_fst_by_latitude  Region     mean_fst_by_region  Country    mean_fst_by_country  City               Abbreviation  mean_fst
Above 30N       0.17                  East       0.14                Japan      0.17                 Kagoshima          KAG           0.15
Above 30N       0.17                  East       0.14                Japan      0.17                 Kanazawa           KAN           0.20
Above 30N       0.17                  East       0.14                Japan      0.17                 Utsunomiya         UTS           0.17
Below 30N       0.13                  East       0.14                China      0.09                 Hainan             HAI           0.09
Below 30N       0.13                  East       0.14                China      0.09                 Hunan              HUN           0.10
Below 30N       0.13                  East       0.14                China      0.09                 Yunnan             YUN           0.08
Below 30N       0.13                  East       0.14                Japan      0.17                 Okinawa            OKI           0.15
Below 30N       0.13                  East       0.14                Taiwan     0.21                 Tainan             TAI           0.21
Below 30N       0.13                  South      0.16                India      0.10                 Bengaluru          BEN           0.10
Below 30N       0.13                  South      0.16                Maldives   0.23                 Kunfunadhoo        KUN           0.23
Below 30N       0.13                  South      0.16                Nepal      0.14                 Kathmandu          KAT           0.14
Below 30N       0.13                  Southeast  0.12                Cambodia   0.08                 Phnom Penh         CAM           0.08
Below 30N       0.13                  Southeast  0.12                Indonesia  0.17                 Jakarta            INJ           0.16
Below 30N       0.13                  Southeast  0.12                Indonesia  0.17                 Sulawesi (Forest)  SUF           0.17
Below 30N       0.13                  Southeast  0.12                Indonesia  0.17                 Sulawesi (Urban)   SUU           0.13
Below 30N       0.13                  Southeast  0.12                Indonesia  0.17                 Wainyapu           INW           0.20
Below 30N       0.13                  Southeast  0.12                Malaysia   0.11                 Kuala Lumpur       KLP           0.13
Below 30N       0.13                  Southeast  0.12                Malaysia   0.11                 Tambun             MAT           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.08                 Chanthaburi        CHA           0.08
Below 30N       0.13                  Southeast  0.12                Thailand   0.08                 Kanchanaburi       KAC           0.08
Below 30N       0.13                  Southeast  0.12                Thailand   0.08                 Lampang            LAM           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.08                 Sisaket            SSK           0.08
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Hanoi              HAN           0.10
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Ho Chi Minh City   HOC           0.10
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Quy Nhon City      QNC           0.15

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "fst", 
  "fst_r2_0.1_SNPS.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make a scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "fst", "r2_0.1_country.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           HAI      YUN       HUN      OKI       KAN       UTS       KAG
## HAI    0.0000 1035.308  965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081    0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN  965.7432 1107.931    0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283    0.000 1390.3468 1589.7383  617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347    0.0000  288.4938  790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738  288.4938    0.0000 1023.2713
##           TAI      BEN      KUN      KAT       CAM      SUF      SUU      INW
## HAI 1180.1356 3487.311 4241.329 2648.016  987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN  985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI  873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
##          INJ      KLP      MAT       SSK      KAC      CHA       LAM       HAN
## HAI 2844.575 1984.417 1861.932  722.0761 1218.270 1086.825 1070.0656  441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303  714.4744  601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029  953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
##          HOC       QNC
## HAI  990.911  604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
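
The nested loop is fine for 25 sites, but geosphere also provides distm(), which returns the whole pairwise matrix in one call; a sketch using the same filtered_sampling_loc:

# Vectorized alternative: distm() computes all pairwise Haversine distances
# (in meters) in a single call
coords <- as.matrix(filtered_sampling_loc[, c("Longitude", "Latitude")])
distance_matrix <- distm(coords, fun = distHaversine) / 1000  # km
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation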

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0

# Align 'aa' with 'distance_matrix': the two matrices were built with different
# population orders, so as.vector() would otherwise pair mismatched entries
aa <- aa[rownames(distance_matrix), colnames(distance_matrix)]

# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

head(data)
##    Distance        FST row_index col_index
## 1    0.0000 0.00000000       HAI       HAI
## 2 1035.3081 0.08802883       HAI       YUN
## 3  965.7432 0.11878133       HAI       HUN
## 4 2047.0214 0.19738113       HAI       OKI
## 5 3269.9948 0.16804359       HAI       KAN
## 6 3527.0034 0.07441617       HAI       UTS

Fit a linear regression. Following Rousset (1997), Fst/(1-Fst) is expected to increase approximately linearly with the log of geographic distance under two-dimensional isolation by distance, so we regress linearized Fst on log(distance).

# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))


# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),  # Increase font size for x-axis
        axis.text.y = element_text(size = 14
                                   ))  # Increase font size for y-axis
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_all_samples_LD2.pdf"),
  width = 5,
  height = 5,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
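
Because the pairwise entries of a distance matrix are not independent, the OLS R^2 above is descriptive only; the usual significance test for isolation by distance is a Mantel test. A minimal sketch, assuming the vegan package is installed (it is not among the libraries loaded above):

library(vegan)  # assumption: vegan is available

# Mantel test between log-distance and linearized Fst (999 permutations)
fst_lin  <- as.dist(aa / (1 - aa))
log_dist <- as.dist(log(distance_matrix))
mantel(log_dist, fst_lin, method = "pearson", permutations = 999)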

Subset by country. Select countries with at least 3 sampling localities.

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "China"     "Japan"     "Indonesia" "Thailand"  "Vietnam"

Run the test for each country

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  subset_data <- subset_data[subset_data$Distance > 0, ]
  # Perform linear regression
  lm_model <- lm(FST/(1-FST) ~ log(Distance), data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $China
## $China$equation
## [1] "y = -0.60267x + 4.278"
## 
## $China$r2
## [1] "R^2 = 1.00"
## 
## 
## $Japan
## $Japan$equation
## [1] "y = 0.04552x + -0.120"
## 
## $Japan$r2
## [1] "R^2 = 0.08"
## 
## 
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.05263x + 0.512"
## 
## $Indonesia$r2
## [1] "R^2 = 0.16"
## 
## 
## $Thailand
## $Thailand$equation
## [1] "y = 0.28828x + -1.569"
## 
## $Thailand$r2
## [1] "R^2 = 0.38"
## 
## 
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.05628x + 0.520"
## 
## $Vietnam$r2
## [1] "R^2 = 0.15"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = Country) %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == Country)  # Ensures the data is within the same country


# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% countries_to_include) %>%
  ungroup()

Calculate the linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST/(1-FST) ~ log(Distance), data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("italic(y) == %.3f * italic(x) + %.3f", coef(model)[2], coef(model)[1]),
         r2 = sprintf("italic(R)^2 == %.2f", summary(model)$r.squared))
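
do() has been superseded in current dplyr; the same per-country models can be fitted with the nest/map idiom (purrr loads with tidyverse). A sketch producing the same equation and r2 columns:

# Alternative without do(): nest the data by country, then map over the models
regression_results <- data_filtered %>%
  group_by(Country1) %>%
  tidyr::nest() %>%
  mutate(
    model    = purrr::map(data, ~ lm(FST / (1 - FST) ~ log(Distance), data = .x)),
    equation = purrr::map_chr(model, ~ sprintf("italic(y) == %.3f * italic(x) + %.3f",
                                               coef(.x)[2], coef(.x)[1])),
    r2       = purrr::map_chr(model, ~ sprintf("italic(R)^2 == %.2f",
                                               summary(.x)$r.squared))
  ) %>%
  ungroup()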

Plot it

ggplot(data_filtered, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(
    data = regression_results, 
    aes(label = paste(equation, r2, sep = "~~")), 
    x = Inf, y = Inf, 
    vjust = 2, hjust = 1.2, 
    size = 3.5, 
    parse = TRUE, 
    inherit.aes = FALSE
  ) +
  labs(title = "Fst vs Distance by Country",
       x = "Log(Distance)",
       y = "Fst(1-Fst)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_countries_LD2.pdf"),
  width = 6,
  height = 8,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the matrix (Fst two decimals and distance in Km with zero decimals)
# Format the elements based on their position in the matrix
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     BEN    CAM    CHA    HAI    HAN    HOC    HUN    INJ    INW    KAC   
## BEN NA     "0.03" "0.02" "0.07" "0.07" "0.08" "0.09" "0.14" "0.17" "0.02"
## CAM "2976" NA     "0.00" "0.04" "0.04" "0.05" "0.06" "0.13" "0.16" "0.00"
## CHA "2660" "329"  NA     "0.05" "0.05" "0.06" "0.07" "0.13" "0.16" "0.00"
## HAI "3487" "987"  "1087" NA     "0.04" "0.03" "0.04" "0.15" "0.19" "0.05"
## HAN "3134" "1059" "1019" "441"  NA     "0.05" "0.05" "0.16" "0.22" "0.04"
## HOC "3174" "209"  "537"  "991"  "1146" NA     "0.04" "0.17" "0.21" "0.06"
## HUN "3913" "1929" "1956" "966"  "954"  "1954" NA     "0.16" "0.20" "0.07"
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA     "0.07" "0.13"
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA     "0.17"
## KAC "2377" "647"  "320"  "1218" "1028" "856"  "1981" "2393" "3492" NA    
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713"
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456"
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119"
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234"
## KUN "957"  "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052"
## LAM "2419" "951"  "692"  "1070" "733"  "1139" "1637" "2844" "3859" "475" 
## MAT "2751" "875"  "893"  "1862" "1894" "912"  "2799" "1363" "2650" "1060"
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265"
## QNC "3424" "528"  "782"  "604"  "882"  "436"  "1565" "2241" "2880" "1047"
## SSK "2894" "402"  "368"  "722"  "678"  "547"  "1595" "2390" "3271" "530" 
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859"  "2874"
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973"  "3147"
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986"  "3562" "3635" "2397"
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722"
## YUN "2806" "1487" "1323" "1035" "601"  "1627" "1108" "3467" "4320" "1179"
##     KAG    KAN    KAT    KLP    KUN    LAM    MAT    OKI    QNC    SSK   
## BEN "0.15" "0.19" "0.11" "0.08" "0.13" "0.03" "0.04" "0.14" "0.10" "0.03"
## CAM "0.12" "0.17" "0.10" "0.05" "0.15" "0.01" "0.02" "0.11" "0.08" "0.01"
## CHA "0.13" "0.18" "0.10" "0.06" "0.15" "0.01" "0.02" "0.12" "0.08" "0.00"
## HAI "0.10" "0.14" "0.13" "0.08" "0.19" "0.05" "0.06" "0.09" "0.12" "0.05"
## HAN "0.12" "0.17" "0.14" "0.09" "0.22" "0.05" "0.06" "0.11" "0.13" "0.05"
## HOC "0.10" "0.15" "0.15" "0.09" "0.21" "0.06" "0.07" "0.10" "0.13" "0.06"
## HUN "0.08" "0.12" "0.15" "0.10" "0.20" "0.07" "0.08" "0.07" "0.14" "0.07"
## INJ "0.22" "0.27" "0.09" "0.17" "0.26" "0.13" "0.14" "0.21" "0.20" "0.13"
## INW "0.26" "0.32" "0.11" "0.24" "0.34" "0.17" "0.18" "0.26" "0.24" "0.16"
## KAC "0.13" "0.18" "0.10" "0.06" "0.17" "0.01" "0.02" "0.12" "0.08" "0.00"
## KAG NA     "0.13" "0.21" "0.17" "0.26" "0.14" "0.14" "0.12" "0.19" "0.13"
## KAN "791"  NA     "0.25" "0.22" "0.32" "0.19" "0.19" "0.17" "0.24" "0.18"
## KAT "4367" "4880" NA     "0.15" "0.23" "0.11" "0.11" "0.20" "0.18" "0.10"
## KLP "4376" "5163" "3242" NA     "0.24" "0.06" "0.06" "0.16" "0.14" "0.06"
## KUN "6631" "7300" "2779" "3196" NA     "0.17" "0.17" "0.26" "0.24" "0.16"
## LAM "3450" "4160" "1789" "1704" "3202" NA     "0.02" "0.12" "0.09" "0.01"
## MAT "4289" "5073" "3069" "177"  "3124" "1531" NA     "0.13" "0.10" "0.02"
## OKI "618"  "1390" "4204" "3822" "6262" "3060" "3747" NA     "0.19" "0.12"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA     "0.08"
## SSK "3232" "3991" "2412" "1365" "3582" "624"  "1217" "2758" "548"  NA    
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873"  "1548" "1886"
## UTS "1023" "288"  "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714"  "2208" "2678" "1449" "1087"
##     SUF    SUU    TAI    UTS    YUN    
## BEN "0.15" "0.11" "0.20" "0.17" "0.02" 
## CAM "0.14" "0.09" "0.17" "0.14" "0.00" 
## CHA "0.14" "0.10" "0.18" "0.15" "0.00" 
## HAI "0.16" "0.09" "0.14" "0.11" "0.05" 
## HAN "0.18" "0.11" "0.18" "0.13" "0.04" 
## HOC "0.17" "0.10" "0.16" "0.12" "0.06" 
## HUN "0.17" "0.10" "0.13" "0.09" "0.07" 
## INJ "0.08" "0.09" "0.27" "0.24" "0.13" 
## INW "0.10" "0.13" "0.34" "0.28" "0.16" 
## KAC "0.15" "0.10" "0.19" "0.15" "-0.00"
## KAG "0.23" "0.17" "0.19" "0.09" "0.13" 
## KAN "0.27" "0.21" "0.23" "0.10" "0.18" 
## KAT "0.06" "0.07" "0.26" "0.22" "0.10" 
## KLP "0.20" "0.14" "0.24" "0.19" "0.05" 
## KUN "0.30" "0.25" "0.34" "0.28" "0.16" 
## LAM "0.15" "0.10" "0.19" "0.16" "0.01" 
## MAT "0.15" "0.11" "0.19" "0.16" "0.02" 
## OKI "0.22" "0.16" "0.17" "0.14" "0.12" 
## QNC "0.22" "0.17" "0.25" "0.21" "0.08" 
## SSK "0.14" "0.10" "0.18" "0.15" "0.00" 
## SUF NA     "0.04" "0.29" "0.24" "0.14" 
## SUU "381"  NA     "0.22" "0.18" "0.10" 
## TAI "2777" "2747" NA     "0.20" "0.18" 
## UTS "4755" "4566" "2418" NA     "0.15" 
## YUN "3566" "3744" "1928" "3902" NA
cities <- readRDS(here("output", "populations", "cities.rds"))
head(cities)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Your existing code to initialize and fill the merged_matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     SSK    QNC    CAM    HOC    CHA    HAI    HAN    KAC    LAM    YUN   
## SSK NA     "0.08" "0.01" "0.06" "0.00" "0.05" "0.05" "0.00" "0.01" "0.00"
## QNC "548"  NA     "0.08" "0.13" "0.08" "0.12" "0.13" "0.08" "0.09" "0.08"
## CAM "402"  "528"  NA     "0.05" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## HOC "547"  "436"  "209"  NA     "0.06" "0.03" "0.05" "0.06" "0.06" "0.06"
## CHA "368"  "782"  "329"  "537"  NA     "0.05" "0.05" "0.00" "0.01" "0.00"
## HAI "722"  "604"  "987"  "991"  "1087" NA     "0.04" "0.05" "0.05" "0.05"
## HAN "678"  "882"  "1059" "1146" "1019" "441"  NA     "0.04" "0.05" "0.04"
## KAC "530"  "1047" "647"  "856"  "320"  "1218" "1028" NA     "0.01" "0.00"
## LAM "624"  "1155" "951"  "1139" "692"  "1070" "733"  "475"  NA     "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601"  "1179" "714"  NA    
## MAT "1217" "1348" "875"  "912"  "893"  "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966"  "954"  "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
##     MAT    HUN    KLP    TAI    OKI    SUF    INJ    SUU    KAT    KAG   
## SSK "0.02" "0.07" "0.06" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## QNC "0.10" "0.14" "0.14" "0.25" "0.19" "0.22" "0.20" "0.17" "0.18" "0.19"
## CAM "0.02" "0.06" "0.05" "0.17" "0.11" "0.14" "0.13" "0.09" "0.10" "0.12"
## HOC "0.07" "0.04" "0.09" "0.16" "0.10" "0.17" "0.17" "0.10" "0.15" "0.10"
## CHA "0.02" "0.07" "0.06" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## HAI "0.06" "0.04" "0.08" "0.14" "0.09" "0.16" "0.15" "0.09" "0.13" "0.10"
## HAN "0.06" "0.05" "0.09" "0.18" "0.11" "0.18" "0.16" "0.11" "0.14" "0.12"
## KAC "0.02" "0.07" "0.06" "0.19" "0.12" "0.15" "0.13" "0.10" "0.10" "0.13"
## LAM "0.02" "0.07" "0.06" "0.19" "0.12" "0.15" "0.13" "0.10" "0.11" "0.14"
## YUN "0.02" "0.07" "0.05" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## MAT NA     "0.08" "0.06" "0.19" "0.13" "0.15" "0.14" "0.11" "0.11" "0.14"
## HUN "2799" NA     "0.10" "0.13" "0.07" "0.17" "0.16" "0.10" "0.15" "0.08"
## KLP "177"  "2932" NA     "0.24" "0.16" "0.20" "0.17" "0.14" "0.15" "0.17"
## TAI "2894" "986"  "2979" NA     "0.17" "0.29" "0.27" "0.22" "0.26" "0.19"
## OKI "3747" "1599" "3822" "873"  NA     "0.22" "0.21" "0.16" "0.20" "0.12"
## SUF "2220" "3406" "2115" "2777" "3280" NA     "0.08" "0.04" "0.06" "0.23"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA     "0.09" "0.09" "0.22"
## SUU "2565" "3468" "2469" "2747" "3154" "381"  "1911" NA     "0.07" "0.17"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA     "0.21"
## KAG "4289" "1860" "4376" "1397" "618"  "3893" "4901" "3756" "4367" NA    
## INW "2650" "4248" "2505" "3635" "4110" "859"  "1528" "973"  "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791" 
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
##     INW    BEN    KAN    UTS    KUN   
## SSK "0.16" "0.03" "0.18" "0.15" "0.16"
## QNC "0.24" "0.10" "0.24" "0.21" "0.24"
## CAM "0.16" "0.03" "0.17" "0.14" "0.15"
## HOC "0.21" "0.08" "0.15" "0.12" "0.21"
## CHA "0.16" "0.02" "0.18" "0.15" "0.15"
## HAI "0.19" "0.07" "0.14" "0.11" "0.19"
## HAN "0.22" "0.07" "0.17" "0.13" "0.22"
## KAC "0.17" "0.02" "0.18" "0.15" "0.17"
## LAM "0.17" "0.03" "0.19" "0.16" "0.17"
## YUN "0.16" "0.02" "0.18" "0.15" "0.16"
## MAT "0.18" "0.04" "0.19" "0.16" "0.17"
## HUN "0.20" "0.09" "0.12" "0.09" "0.20"
## KLP "0.24" "0.08" "0.22" "0.19" "0.24"
## TAI "0.34" "0.20" "0.23" "0.20" "0.34"
## OKI "0.26" "0.14" "0.17" "0.14" "0.26"
## SUF "0.10" "0.15" "0.27" "0.24" "0.30"
## INJ "0.07" "0.14" "0.27" "0.24" "0.26"
## SUU "0.13" "0.11" "0.21" "0.18" "0.25"
## KAT "0.11" "0.11" "0.25" "0.22" "0.23"
## KAG "0.26" "0.15" "0.13" "0.09" "0.26"
## INW NA     "0.17" "0.32" "0.28" "0.34"
## BEN "5347" NA     "0.19" "0.17" "0.13"
## KAN "5424" "6410" NA     "0.10" "0.32"
## UTS "5539" "6696" "288"  NA     "0.28"
## KUN "5513" "957"  "7300" "7583" NA

Make a table and save it as a Word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns so Population comes first
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]

# Create a flextable object from the merged data frame
ft <- qflextable(merged_df)

ft

[Rendered flextable: the distance-sorted merged matrix above as a Word-ready table, with pairwise Fst (upper triangle, two decimals) and great-circle distances in km (lower triangle, zero decimals) for the 25 populations; the values duplicate the printed matrix.]

# Create a new Word document
doc <- read_docx()

# Add the flextable to the Word document
doc <- body_add_flextable(doc, ft)

# Save the Word document
print(doc, target =  here("output", "fst", "LD2.docx"))
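If a plain-text copy of the table is also useful, the same data frame can be written out with write.csv (a sketch; the file name is an assumption):

# Also save the merged Fst/distance table as CSV (hypothetical file name)
write.csv(merged_df, here("output", "fst", "LD2_fst_distance.csv"), row.names = FALSE)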

4. Comparison of the 3 SNP sets

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  6194515 330.9   11233262 600.0         NA 11233262 600.0
## Vcells 10497539  80.1   18029751 137.6      32768 14958126 114.2

Import the data

Intergenic

intergenic <- readRDS(
  here(
    "output", "fst", "neutral_country.rds"
  )
)
head(intergenic)
## # A tibble: 6 × 2
##   Country   mean_fst
##   <chr>        <dbl>
## 1 Cambodia      0.05
## 2 China         0.07
## 3 India         0.07
## 4 Indonesia     0.11
## 5 Japan         0.12
## 6 Malaysia      0.09

LD1

LD1 <- readRDS(
  here(
    "output", "fst", "r2_0.01_country.rds"
  )
)
head(LD1)
## # A tibble: 6 × 2
##   Country   mean_fst
##   <chr>        <dbl>
## 1 Cambodia      0.08
## 2 China         0.09
## 3 India         0.1 
## 4 Indonesia     0.16
## 5 Japan         0.17
## 6 Malaysia      0.11

LD2

LD2 <- readRDS(
  here(
    "output", "fst", "r2_0.1_country.rds"
  )
)
head(LD2)
## # A tibble: 6 × 2
##   Country   mean_fst
##   <chr>        <dbl>
## 1 Cambodia      0.08
## 2 China         0.09
## 3 India         0.1 
## 4 Indonesia     0.17
## 5 Japan         0.17
## 6 Malaysia      0.11

Merge the data sets

# Add an identifier column to each dataset
intergenic$dataset <- "intergenic"
LD1$dataset <- "LD1"
LD2$dataset <- "LD2"

# Merge the datasets
combined_data <- rbind(intergenic, LD1, LD2)

# Assign a numeric index for plotting based on ordered countries
combined_data <- combined_data %>% 
  arrange(Country) %>%
  group_by(Country) %>%
  mutate(index = dense_rank(Country))

# Custom color mapping
color_mapping <- scale_color_manual(
  values = c(intergenic = "gray", LD1 = "blue", LD2 = "orange"),
  name = "Dataset"
)


# Shapes
shape_mapping <- scale_shape_manual(
  values = c(intergenic = 16, LD1 = 17, LD2 = 18),
  name = "Dataset"
)


# Scatter plot with custom colors and shapes
plot <- ggplot(combined_data, aes(x = Country, y = mean_fst, color = dataset, shape = dataset)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE, aes(group = dataset)) +  # Fitted line per dataset
  labs(
    title = "",
    x = "Countries",
    y = "Mean Fst"
  ) +
  color_mapping +
  shape_mapping +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "top")

# Display the plot
print(plot)
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_datasets_country.pdf"),
  plot, 
  width = 7,
  height = 5,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'

5. Mantel test with LD2

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.1 \
--keep-fam output/fst/pops_4fst.txt \
--make-bed \
--out output/fst/mantel \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel.log

Then make sure to edit the .fam file so the line “BEN a 0 0 0 -9” reads “BEN b 0 0 0 -9”; two BEN samples were given the same individual ID, and each family/individual ID pair must be unique.
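Instead of editing by hand, the fix can be scripted; a minimal R sketch, assuming the standard six-column .fam layout and that the duplicate is the second “BEN a” entry:

# Rename the second duplicated "BEN a" sample to "b" (sketch)
fam <- read.table(here("output", "fst", "mantel.fam"), header = FALSE)
dup <- which(fam$V1 == "BEN" & fam$V2 == "a")
if (length(dup) > 1) fam$V2[dup[2]] <- "b"
write.table(fam, here("output", "fst", "mantel.fam"),
            quote = FALSE, row.names = FALSE, col.names = FALSE)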

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/mantel \
--recodeA \
--out output/fst/mantel \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel.log

Import the data and convert it to genind format

# import the data
albo <-
  read.PLINK(
    here("output", "fst", "mantel.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)
saveRDS(albo2, here(
  "output", "fst", "albo2.rds"
))

Load it

albo2 <- readRDS(here(
  "output", "fst", "albo2.rds"
))

Import sample locations

sampling_loc <- readRDS(here("output", "populations", "cities.rds"))
head(sampling_loc)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN

The fam file

fam_file <- here(
  "output", "fst", "mantel.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# # Replace the BEN a with BEN b (remember to never name samples with the same ID... I change it manually in the fam file.)
# fam_data <- fam_data %>% 
#   mutate(IndividualID = ifelse(FamilyID == "BEN" & IndividualID == "a", "b", IndividualID))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      OKI         1001          0          0   2        -9  Okinawa   Japan
## 2      OKI         1002          0          0   2        -9  Okinawa   Japan
## 3      OKI         1003          0          0   2        -9  Okinawa   Japan
## 4      OKI         1004          0          0   2        -9  Okinawa   Japan
## 5      OKI         1005          0          0   2        -9  Okinawa   Japan
## 6      OKI         1006          0          0   1        -9  Okinawa   Japan
##   Latitude Longitude Region
## 1  26.5013  127.9454   Asia
## 2  26.5013  127.9454   Asia
## 3  26.5013  127.9454   Asia
## 4  26.5013  127.9454   Asia
## 5  26.5013  127.9454   Asia
## 6  26.5013  127.9454   Asia

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##         [,1]     [,2]
## [1,] 26.5013 127.9454
## [2,] 26.5013 127.9454
## [3,] 26.5013 127.9454
## [4,] 26.5013 127.9454
## [5,] 26.5013 127.9454
## [6,] 26.5013 127.9454
colnames(albo_dist2)<- c("x","y") 

Add jitter, so that individuals from the same population do not share identical coordinates (identical points would give zero geographic distances between them)

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 26.51917 127.9537
## [2,] 26.51278 127.9527
## [3,] 26.52068 127.9563
## [4,] 26.52102 127.9391
## [5,] 26.51013 127.9400
## [6,] 26.51191 127.9547
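Note that dist(), used later for Dgeo in the Mantel test, computes Euclidean distances on these coordinates, i.e. in decimal degrees. If great-circle distances are preferred, geosphere (already loaded) can supply them; a sketch, remembering that albo_dist2 stores latitude in column x and longitude in column y, while geosphere expects (longitude, latitude):

# Optional: Haversine distances in km rather than Euclidean degrees (sketch)
# Columns are swapped because geosphere expects (longitude, latitude)
Dgeo_km <- as.dist(distm(albo_dist2[, c("y", "x")], fun = distHaversine) / 1000)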

Add the coordinates to the genind object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "output", "fst", "albo2.rds"
  )
)

Isolation by distance

Convert the data from genind to genpop format

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Select one mosquito per population; this is only to obtain a single set of geographic coordinates for each population

# Note: sample() is random; call set.seed() first if you need the same
# individuals (and hence the same coordinates) across runs
unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

# Edwards' angular distance between populations (dist.genpop method 2)
Dgen <- dist.genpop(toto, method = 2)
# Euclidean distance between the jittered coordinates (in degrees)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen, Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: 0.225538 
## 
## Based on 999 replicates
## Simulated p-value: 0.074 
## Alternative hypothesis: greater 
## 
##     Std.Obs Expectation    Variance 
## 1.504698485 0.005512092 0.021382012

Explanations: Observation: This is the observed Mantel statistic for your data, 0.225538. It is the correlation between the two distance matrices; a positive value means that as one distance increases, so does the other.

Monte-Carlo Test: One of the matrices is repeatedly permuted and the Mantel statistic recalculated for each permutation. This builds the distribution of the statistic under the null hypothesis, which states that there is no association between the matrices.

Simulated p-value: The p-value is 0.074, the proportion of permutations in which the Mantel statistic was greater than or equal to the observed value (0.225538). It is above the conventional 0.05 cutoff, so the observed correlation is suggestive but not conventionally significant.

Alternative Hypothesis: greater: This indicates a one-sided test of whether the observed Mantel statistic is larger than expected by chance.

Std.Obs (Standardized Observation): This is the observed statistic standardized using the null expectation and variance from the Monte-Carlo replicates: (0.225538 − 0.005512) / sqrt(0.021382) ≈ 1.50.

Expectation and Variance: These are the mean (0.005512092) and variance (0.021382012) of the Mantel statistic under the null hypothesis, estimated from the Monte-Carlo replicates.

Interpretation: The observed Mantel statistic (0.225538) suggests a moderate positive correlation between genetic and geographic distances. The p-value of 0.074 indicates a trend toward isolation by distance, but it is not strong enough to reject the null hypothesis of no correlation at the 0.05 level. The test specifically asked whether genetic distance increases with geographic distance (one-sided). The standardized observation being positive and relatively large (about 1.50) likewise indicates that the observed correlation is somewhat higher than expected by chance.
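For intuition, here is a minimal sketch of the permutation scheme behind the simulated p-value (an illustration, not adegenet's actual implementation; mantel_perm is a made-up helper name):

# Minimal Mantel permutation test (sketch, for intuition only)
mantel_perm <- function(Dgen, Dgeo, nperm = 999) {
  m1 <- as.matrix(Dgen)
  m2 <- as.matrix(Dgeo)
  obs <- cor(as.vector(as.dist(m1)), as.vector(as.dist(m2)))
  sims <- replicate(nperm, {
    idx <- sample(nrow(m2))  # permute rows and columns together
    cor(as.vector(as.dist(m1)), as.vector(as.dist(m2[idx, idx])))
  })
  # one-sided ("greater") p-value, counting the observation itself
  p <- (sum(sims >= obs) + 1) / (nperm + 1)
  list(observation = obs, p.value = p)
}
# mantel_perm(Dgen, Dgeo)
# Sanity check of the reported Std.Obs:
# (0.225538 - 0.005512) / sqrt(0.021382) is about 1.50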

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "fst", "sim.pdf"))
plot(ibd)
dev.off()
## quartz_off_screen 
##                 2
plot(ibd)

plot(Dgeo, Dgen)
# Fit a linear model with genetic distance (Dgen) as the response and
# geographic distance (Dgeo) as the predictor. The dist objects are coerced
# with as.vector() because lm() requires vectors, not dist structures.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col = "red", lty = 2)

Add the equation

# Plotting the data
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
# You can adjust the position (e.g., x and y values) as necessary
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.9, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.85, labels = r2_label)

Use library MASS (kde2d) to add a density surface to the plot

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
png(here("output", "fst", "ibd2.png"),
    width = 5,
    height = 4,
    units = 'in',
    res = 300)
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.5, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.45, labels = r2_label)

title("Isolation by distance")
dev.off()
## quartz_off_screen 
##                 2

Check the populations - I did not include those with fewer than 4 mosquitoes

summary(albo2$pop)
## BEN CAM CHA HAI HAN HOC HUN INJ INW KAC KAG KAN KAT KLP KUN LAM MAT OKI QNC SSK 
##  12  12  12  12   4   7  12  11   4   6  12  11  10   4   4   9  12  11  12  12 
## SUF SUU TAI UTS YUN 
##   6   6   8  12   9
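A quick programmatic check of the same filter (a sketch):

# Should return TRUE: every retained population has at least 4 individuals
all(table(albo2$pop) >= 4)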