Libraries

library(StAMPP)
library(ggplot2)
library(tidyverse)
library(adegenet)
library(here)
library(flextable)
library(officer)
library(reshape2)
library(dplyr)
library(tidyr)
library(geosphere)
library(dartR)
library(MASS)
library(Cairo)

We can use different data sets to run our Fst estimates.

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 5834115 311.6    9855684 526.4         NA  7331362 391.6
## Vcells 9598989  73.3   14832738 113.2      32768 12293949  93.8

1. Intergenic SNPs

We can estimate Fst using only the neutral set of SNPs for populations with at least 4 individuals.

Create list of populations

awk '{print $1}' output/populations/snps_sets/neutral.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 4 {print}' | awk '{print $1}' > output/fst/pops_4fst.txt;
head  output/fst/pops_4fst.txt;
wc -l output/fst/pops_4fst.txt
## BEN
## CAM
## CHA
## HAI
## HAN
## HOC
## HUN
## INJ
## INW
## KAC
##       25 output/fst/pops_4fst.txt
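
For reference, the same filtering can be done in R instead of awk. A minimal sketch, assuming the population (family) IDs sit in the first column of the .fam file:

# Count individuals per population (first column of the .fam file)
# and keep populations with at least 4 individuals
fam <- read.table(here("output", "populations", "snps_sets", "neutral.fam"))
pop_counts <- table(fam$V1)
writeLines(sort(names(pop_counts[pop_counts >= 4])),
           here("output", "fst", "pops_4fst.txt"))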

We have 25 populations with 4 or more mosquitoes. We can convert the data to raw format and subset the bed file.

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/neutral \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/neutral \
--silent;
grep 'samples\|variants\|remaining' output/fst/neutral.log
## 9483 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.96967.
## 9483 variants and 230 people pass filters and QC.

See https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of the Fst estimation.

neutral <-
  read.PLINK(
    here(
      "output", "fst", "neutral.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

summary(neutral)

Now let's convert the genlight object to StAMPP format and estimate pairwise Fst values.

The named-argument call, e.g. stamppFst(neutral_2, nboots = 100, percent = 95, nclusters = 10), would also work; below we simplify it and pass only the numbers positionally.

This chunk will take a couple of minutes to run.

# convert
neutral_2 <- stamppConvert(neutral, type="genlight")

# run StAMPP. If you want to run with bootstraps and multiple clusters, use
# the HPC; it will run out of memory on a 32 GB laptop
# (positional arguments: nboots = 1, percent = 95, nclusters = 1)
neutral_3 <- stamppFst(neutral_2, 1, 95, 1)
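
On the HPC, a bootstrapped run returns a list rather than a single matrix. A sketch, assuming 100 bootstraps and 10 cores:

# With nboots > 1, stamppFst returns a list holding the pairwise Fst matrix,
# p-values, and bootstrap confidence limits (see the StAMPP manual linked above)
neutral_boot <- stamppFst(neutral_2, nboots = 100, percent = 95, nclusters = 10)
neutral_boot$Fsts     # pairwise Fst point estimates
neutral_boot$Pvalues  # p-values for each pairwise comparison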

Save it

saveRDS(
  neutral_3, here(
    "output", "fst", "neutral.rds"
  )
)

To load it

neutral_3 <- readRDS(
  here(
    "output", "fst", "neutral.rds"
  )
)

Now let's look at the object

summary(neutral_3)
##       OKI               HAI               YUN                 KAT         
##  Min.   :0.06693   Min.   :0.02807   Min.   :-0.002161   Min.   :0.06445  
##  1st Qu.:0.08088   1st Qu.:0.03572   1st Qu.: 0.017006   1st Qu.:0.07817  
##  Median :0.09944   Median :0.05998   Median : 0.055814   Median :0.09368  
##  Mean   :0.11276   Mean   :0.06803   Mean   : 0.056186   Mean   :0.11228  
##  3rd Qu.:0.14177   3rd Qu.:0.09611   3rd Qu.: 0.082102   3rd Qu.:0.13943  
##  Max.   :0.21810   Max.   :0.15999   Max.   : 0.143292   Max.   :0.21573  
##  NA's   :1         NA's   :2         NA's   :3           NA's   :4        
##       TAI              HUN               HAN               KLP         
##  Min.   :0.1233   Min.   :0.03590   Min.   :0.03056   Min.   :0.04714  
##  1st Qu.:0.1495   1st Qu.:0.04296   1st Qu.:0.04201   1st Qu.:0.05136  
##  Median :0.1642   Median :0.05908   Median :0.08134   Median :0.10614  
##  Mean   :0.1780   Mean   :0.07034   Mean   :0.08233   Mean   :0.10225  
##  3rd Qu.:0.2007   3rd Qu.:0.09196   3rd Qu.:0.11386   3rd Qu.:0.12868  
##  Max.   :0.3004   Max.   :0.16409   Max.   :0.19157   Max.   :0.21872  
##  NA's   :5        NA's   :6         NA's   :7         NA's   :8        
##       HOC               QNC               SSK                KAC           
##  Min.   :0.03383   Min.   :0.07153   Min.   :0.000295   Min.   :-0.000224  
##  1st Qu.:0.04598   1st Qu.:0.07691   1st Qu.:0.010202   1st Qu.: 0.014289  
##  Median :0.07343   Median :0.13239   Median :0.069284   Median : 0.075309  
##  Mean   :0.08110   Mean   :0.12117   Mean   :0.056040   Mean   : 0.060492  
##  3rd Qu.:0.10792   3rd Qu.:0.14787   3rd Qu.:0.083976   3rd Qu.: 0.085694  
##  Max.   :0.17718   Max.   :0.21538   Max.   :0.134878   Max.   : 0.145295  
##  NA's   :9         NA's   :10        NA's   :11         NA's   :12         
##       CHA                KAN               UTS               CAM          
##  Min.   :0.003151   Min.   :0.09212   Min.   :0.07893   Min.   :0.008014  
##  1st Qu.:0.017775   1st Qu.:0.11198   1st Qu.:0.09267   1st Qu.:0.019739  
##  Median :0.077002   Median :0.12201   Median :0.10527   Median :0.070429  
##  Mean   :0.063989   Mean   :0.14434   Mean   :0.12345   Mean   :0.064043  
##  3rd Qu.:0.087944   3rd Qu.:0.16900   3rd Qu.:0.14159   3rd Qu.:0.081284  
##  Max.   :0.131817   Max.   :0.23897   Max.   :0.21422   Max.   :0.135431  
##  NA's   :13         NA's   :14        NA's   :15        NA's   :16        
##       KAG               BEN               LAM               SUF         
##  Min.   :0.08122   Min.   :0.02333   Min.   :0.01692   Min.   :0.03162  
##  1st Qu.:0.09061   1st Qu.:0.05390   1st Qu.:0.06908   1st Qu.:0.06323  
##  Median :0.12139   Median :0.08997   Median :0.08394   Median :0.06969  
##  Mean   :0.12750   Mean   :0.07831   Mean   :0.08446   Mean   :0.09810  
##  3rd Qu.:0.14741   3rd Qu.:0.10404   3rd Qu.:0.10618   3rd Qu.:0.09388  
##  Max.   :0.20375   Max.   :0.11899   Max.   :0.14422   Max.   :0.23208  
##  NA's   :17        NA's   :18        NA's   :19        NA's   :20       
##       SUU               KUN              INW               INJ         
##  Min.   :0.06603   Min.   :0.1486   Min.   :0.06648   Min.   :0.08835  
##  1st Qu.:0.07088   1st Qu.:0.1799   1st Qu.:0.08016   1st Qu.:0.08835  
##  Median :0.08605   Median :0.2113   Median :0.09384   Median :0.08835  
##  Mean   :0.11206   Mean   :0.2123   Mean   :0.09384   Mean   :0.08835  
##  3rd Qu.:0.12723   3rd Qu.:0.2441   3rd Qu.:0.10751   3rd Qu.:0.08835  
##  Max.   :0.21012   Max.   :0.2769   Max.   :0.12119   Max.   :0.08835  
##  NA's   :21        NA's   :22       NA's   :23        NA's   :24       
##       MAT     
##  Min.   : NA  
##  1st Qu.: NA  
##  Median : NA  
##  Mean   :NaN  
##  3rd Qu.: NA  
##  Max.   : NA  
##  NA's   :25

If you want, you can save the Fst values as a CSV file.

# Convert to data frame
neutral_df <- data.frame(neutral_3)

# Save it
write.csv(neutral_df, file = here("output", "fst", "neutral_df.csv"))

Check the Fst values

head(neutral_df)
##            OKI        HAI        YUN       KAT       TAI HUN HAN KLP HOC QNC
## OKI         NA         NA         NA        NA        NA  NA  NA  NA  NA  NA
## HAI 0.07268706         NA         NA        NA        NA  NA  NA  NA  NA  NA
## YUN 0.07937617 0.03397451         NA        NA        NA  NA  NA  NA  NA  NA
## KAT 0.14886465 0.10179042 0.07762791        NA        NA  NA  NA  NA  NA  NA
## TAI 0.15614563 0.12689237 0.14329227 0.2157330        NA  NA  NA  NA  NA  NA
## HUN 0.06693096 0.02807325 0.04026851 0.1034478 0.1233325  NA  NA  NA  NA  NA
##     SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## YUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KAT  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

We will convert the data into a matrix.

aa <- as.matrix(neutral_df)
# mirror the lower triangle into the upper triangle to complete the matrix
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##            OKI        HAI        YUN        KAT       TAI        HUN        HAN
## OKI         NA 0.07268706 0.07937617 0.14886465 0.1561456 0.06693096 0.09294157
## HAI 0.07268706         NA 0.03397451 0.10179042 0.1268924 0.02807325 0.03807541
## YUN 0.07937617 0.03397451         NA 0.07762791 0.1432923 0.04026851 0.03421635
## KAT 0.14886465 0.10179042 0.07762791         NA 0.2157330 0.10344776 0.11088115
## TAI 0.15614563 0.12689237 0.14329227 0.21573302        NA 0.12333248 0.16471928
## HUN 0.06693096 0.02807325 0.04026851 0.10344776 0.1233325         NA 0.04385787
##            KLP        HOC        QNC         SSK          KAC          CHA
## OKI 0.12753263 0.07966691 0.14381991 0.080917127  0.082764633 0.0807573335
## HAI 0.07062727 0.02857376 0.09737992 0.036399112  0.031226075 0.0350480370
## YUN 0.04892423 0.03983731 0.07317901 0.001820884 -0.002160858 0.0007563386
## KAT 0.12941018 0.11252413 0.14504693 0.078173740  0.078066760 0.0766586019
## TAI 0.20155700 0.14637599 0.20875730 0.142121310  0.152625899 0.1397690667
## HUN 0.07440170 0.03618065 0.10249995 0.042059268  0.038117472 0.0413984260
##            KAN        UTS         CAM        KAG        BEN        LAM
## OKI 0.13456942 0.10960357 0.075330223 0.10320709 0.09568234 0.08756987
## HAI 0.09711038 0.07278214 0.030013229 0.06963227 0.04835553 0.03913709
## YUN 0.10828863 0.08297531 0.002435562 0.07597810 0.01756076 0.00724616
## KAT 0.17428707 0.14702796 0.075728451 0.13942968 0.08425904 0.08100140
## TAI 0.19294422 0.17076422 0.136334355 0.16363753 0.15654314 0.15057531
## HUN 0.08888333 0.06669664 0.035903383 0.05908263 0.05334759 0.04549865
##            SUF        SUU       KUN        INW        INJ        MAT
## OKI 0.14108869 0.11063606 0.2181045 0.18145357 0.14398625 0.09257954
## HAI 0.09101011 0.05998414 0.1599904 0.12872924 0.09510255 0.04482997
## YUN 0.08325162 0.06270272 0.1364487 0.10513394 0.07948148 0.01682092
## KAT 0.06444802 0.06708021 0.2034093 0.09367875 0.08599529 0.09162392
## TAI 0.21266171 0.17599097 0.3003592 0.26676013 0.20046117 0.15440513
## HUN 0.09503274 0.06400679 0.1640872 0.13267018 0.09920459 0.05348803

Import sample locations

sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    Region2, Country
  )

# Check it
head(sampling_loc)
## # A tibble: 6 × 7
##   Pop_City Country Latitude Longitude Region Abbreviation Region2  
##   <chr>    <chr>      <dbl>     <dbl> <chr>  <chr>        <chr>    
## 1 Hainan   China       19.2      110. Asia   HAI          East Asia
## 2 Yunnan   China       24.5      101. Asia   YUN          East Asia
## 3 Hunan    China       27.6      112. Asia   HUN          East Asia
## 4 Okinawa  Japan       26.5      128. Asia   OKI          East Asia
## 5 Sendai   Japan       38.3      141. Asia   SEN          East Asia
## 6 Nagasaki Japan       32.8      130. Asia   NAG          East Asia

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##  [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"

Create vector with order of populations

# Extract the populations that appear in neutral_df
populations_in_neutral <- colnames(neutral_df)

# Keep only the populations present in order_pops
# (this preserves the column order of neutral_df)
poporder <- populations_in_neutral[populations_in_neutral %in% order_pops]

# Print the reordered populations
print(poporder)
##  [1] "OKI" "HAI" "YUN" "KAT" "TAI" "HUN" "HAN" "KLP" "HOC" "QNC" "SSK" "KAC"
## [13] "CHA" "KAN" "UTS" "CAM" "KAG" "BEN" "LAM" "SUF" "SUU" "KUN" "INW" "INJ"
## [25] "MAT"

Let's check whether the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Now let's order the matrix using poporder. We will also set the lower triangle of the matrix to NA.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1          Var2         value        
##  OKI    : 25   OKI    : 25   Min.   :-0.0022  
##  HAI    : 25   HAI    : 25   1st Qu.: 0.0589  
##  YUN    : 25   YUN    : 25   Median : 0.0871  
##  KAT    : 25   KAT    : 25   Mean   : 0.0959  
##  TAI    : 25   TAI    : 25   3rd Qu.: 0.1325  
##  HUN    : 25   HUN    : 25   Max.   : 0.3004  
##  (Other):475   (Other):475   NA's   :325

Now let's plot the data with ggplot. You can click the little square at the top left of the plot to open it in a new window; it will have the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

ggsave(
  filename = here("output", "fst", "fst_matrix_neutral.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

By country

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(neutral_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# The symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##             Cambodia      China      India  Indonesia      Japan   Malaysia
## Cambodia          NA 0.02278406 0.01973887 0.08102356 0.07791002 0.03291285
## China     0.02278406         NA 0.03975463 0.09135918 0.07657845 0.05151535
## India     0.01973887 0.03975463         NA 0.09289674 0.10153368 0.05126178
## Indonesia 0.08102356 0.09135918 0.09289674         NA 0.14968381 0.11478857
## Japan     0.07791002 0.07657845 0.10153368 0.14968381         NA 0.11864271
## Malaysia  0.03291285 0.05151535 0.05126178 0.11478857 0.11864271         NA
## Maldives  0.13543120 0.15350879 0.11898677 0.23259152 0.21875971 0.18365581
## Nepal     0.07572845 0.09657847 0.08425904 0.07780057 0.15122311 0.11051705
## Taiwan    0.13633435 0.12921240 0.15654314 0.21396849 0.16596381 0.17798106
## Thailand  0.00533491 0.02637889 0.02054397 0.08322300 0.08912422 0.03349593
## Vietnam   0.04581641 0.05486669 0.06564601 0.12518159 0.10974240 0.08606174
##            Maldives      Nepal    Taiwan   Thailand    Vietnam
## Cambodia  0.1354312 0.07572845 0.1363344 0.00533491 0.04581641
## China     0.1535088 0.09657847 0.1292124 0.02637889 0.05486669
## India     0.1189868 0.08425904 0.1565431 0.02054397 0.06564601
## Indonesia 0.2325915 0.07780057 0.2139685 0.08322300 0.12518159
## Japan     0.2187597 0.15122311 0.1659638 0.08912422 0.10974240
## Malaysia  0.1836558 0.11051705 0.1779811 0.03349593 0.08606174
## Maldives         NA 0.20340929 0.3003592 0.13905356 0.19471054
## Nepal     0.2034093         NA 0.2157330 0.07847512 0.12281740
## Taiwan    0.3003592 0.21573302        NA 0.14627290 0.17328419
## Thailand  0.1390536 0.07847512 0.1462729         NA 0.05095433
## Vietnam   0.1947105 0.12281740 0.1732842 0.05095433         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##           Cambodia      China      India  Indonesia      Japan   Malaysia
## Cambodia        NA 0.02278406 0.01973887 0.08102356 0.07791002 0.03291285
## China           NA         NA 0.03975463 0.09135918 0.07657845 0.05151535
## India           NA         NA         NA 0.09289674 0.10153368 0.05126178
## Indonesia       NA         NA         NA         NA 0.14968381 0.11478857
## Japan           NA         NA         NA         NA         NA 0.11864271
## Malaysia        NA         NA         NA         NA         NA         NA
## Maldives        NA         NA         NA         NA         NA         NA
## Nepal           NA         NA         NA         NA         NA         NA
## Taiwan          NA         NA         NA         NA         NA         NA
## Thailand        NA         NA         NA         NA         NA         NA
## Vietnam         NA         NA         NA         NA         NA         NA
##            Maldives      Nepal    Taiwan   Thailand    Vietnam
## Cambodia  0.1354312 0.07572845 0.1363344 0.00533491 0.04581641
## China     0.1535088 0.09657847 0.1292124 0.02637889 0.05486669
## India     0.1189868 0.08425904 0.1565431 0.02054397 0.06564601
## Indonesia 0.2325915 0.07780057 0.2139685 0.08322300 0.12518159
## Japan     0.2187597 0.15122311 0.1659638 0.08912422 0.10974240
## Malaysia  0.1836558 0.11051705 0.1779811 0.03349593 0.08606174
## Maldives         NA 0.20340929 0.3003592 0.13905356 0.19471054
## Nepal            NA         NA 0.2157330 0.07847512 0.12281740
## Taiwan           NA         NA        NA 0.14627290 0.17328419
## Thailand         NA         NA        NA         NA 0.05095433
## Vietnam          NA         NA        NA         NA         NA
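
The double loop above can also be written in vectorized form. A sketch that takes the element-wise mean of each pair of off-diagonal cells while ignoring NAs:

# Vectorized equivalent of the symmetrizing loop (a sketch)
m <- fst_matrix_summary
n_obs <- (!is.na(m)) + (!is.na(t(m)))  # number of non-NA cells per pair (0-2)
m0 <- replace(m, is.na(m), 0)
t0 <- replace(t(m), is.na(t(m)), 0)
symmetric_fst_matrix2 <- (m0 + t0) / n_obs  # NaN where both cells are NA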

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##         Var1           Var2        value        
##  Cambodia :11   Cambodia :11   Min.   :0.00533  
##  China    :11   China    :11   1st Qu.:0.07069  
##  India    :11   India    :11   Median :0.10153  
##  Indonesia:11   Indonesia:11   Mean   :0.11113  
##  Japan    :11   Japan    :11   3rd Qu.:0.15045  
##  Malaysia :11   Malaysia :11   Max.   :0.30036  
##  (Other)  :55   (Other)  :55   NA's   :66

You can click the little square at the top left of the plot to open it in a new window; it will have the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "fst", "fst_matrix_neutral_by_country.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 25 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 OKI     0.113 
##  2 HAI     0.0682
##  3 YUN     0.0562
##  4 KAT     0.112 
##  5 TAI     0.175 
##  6 HUN     0.0708
##  7 HAN     0.0819
##  8 KLP     0.103 
##  9 HOC     0.0777
## 10 QNC     0.123 
## # ℹ 15 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Region
  )

# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)

# Rename the Region2 column to Region
fst3 <- fst3 |>
  dplyr::rename(Region = Region2)

# check output
head(fst3)
## # A tibble: 6 × 7
##   Pop_City   Country Latitude Longitude Abbreviation Region mean_fst
##   <chr>      <chr>      <dbl>     <dbl> <chr>        <chr>     <dbl>
## 1 Hainan     China       19.2      110. HAI          East     0.0682
## 2 Yunnan     China       24.5      101. YUN          East     0.0562
## 3 Hunan      China       27.6      112. HUN          East     0.0708
## 4 Okinawa    Japan       26.5      128. OKI          East     0.113 
## 5 Kanazawa   Japan       36.6      137. KAN          East     0.136 
## 6 Utsunomiya Japan       36.6      140. UTS          East     0.112

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0682
##  2 Yunnan      China       24.5      101.  YUN          East     0.0562
##  3 Hunan       China       27.6      112.  HUN          East     0.0708
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.113 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.136 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.112 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.106 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.175 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0689
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.189 
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>

Mean by country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0682
##  2 Yunnan      China       24.5      101.  YUN          East     0.0562
##  3 Hunan       China       27.6      112.  HUN          East     0.0708
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.113 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.136 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.112 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.106 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.175 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0689
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.189 
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City
  )

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
##    City        Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0682
##  2 Yunnan      China       24.5      101.  YUN          East     0.0562
##  3 Hunan       China       27.6      112.  HUN          East     0.0708
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.113 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.136 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.112 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.106 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.175 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0689
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.189 
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## #   Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
## # A tibble: 6 × 9
##   Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
##   <chr>                         <dbl> <chr>               <dbl> <chr>  
## 1 Above 30N                      0.12 East                  0.1 Japan  
## 2 Above 30N                      0.12 East                  0.1 Japan  
## 3 Above 30N                      0.12 East                  0.1 Japan  
## 4 Below 30N                      0.09 East                  0.1 China  
## 5 Below 30N                      0.09 East                  0.1 China  
## 6 Below 30N                      0.09 East                  0.1 China  
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## #   Abbreviation <chr>, mean_fst <dbl>
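
As a side note, mutate_if is superseded in current dplyr; the same rounding with the across idiom would be (a sketch):

# Equivalent rounding with the current dplyr idiom (a sketch)
fst4 <- fst4 |>
  mutate(across(where(is.numeric), ~ round(.x, 2)))
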
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values using intergenic SNPs.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values using intergenic SNPs.

Latitude_group  mean_fst_by_latitude  Region     mean_fst_by_region  Country    mean_fst_by_country  City               Abbreviation  mean_fst
Above 30N       0.12                  East       0.10                Japan      0.12                 Kagoshima          KAG           0.11
Above 30N       0.12                  East       0.10                Japan      0.12                 Kanazawa           KAN           0.14
Above 30N       0.12                  East       0.10                Japan      0.12                 Utsunomiya         UTS           0.11
Below 30N       0.09                  East       0.10                China      0.07                 Hainan             HAI           0.07
Below 30N       0.09                  East       0.10                China      0.07                 Hunan              HUN           0.07
Below 30N       0.09                  East       0.10                China      0.07                 Yunnan             YUN           0.06
Below 30N       0.09                  East       0.10                Japan      0.12                 Okinawa            OKI           0.11
Below 30N       0.09                  East       0.10                Taiwan     0.18                 Tainan             TAI           0.18
Below 30N       0.09                  South      0.12                India      0.07                 Bengaluru          BEN           0.07
Below 30N       0.09                  South      0.12                Maldives   0.19                 Kunfunadhoo        KUN           0.19
Below 30N       0.09                  South      0.12                Nepal      0.11                 Kathmandu          KAT           0.11
Below 30N       0.09                  Southeast  0.09                Cambodia   0.05                 Phnom Penh         CAM           0.05
Below 30N       0.09                  Southeast  0.09                Indonesia  0.11                 Jakarta            INJ           0.11
Below 30N       0.09                  Southeast  0.09                Indonesia  0.11                 Sulawesi (Forest)  SUF           0.11
Below 30N       0.09                  Southeast  0.09                Indonesia  0.11                 Sulawesi (Urban)   SUU           0.09
Below 30N       0.09                  Southeast  0.09                Indonesia  0.11                 Wainyapu           INW           0.14
Below 30N       0.09                  Southeast  0.09                Malaysia   0.09                 Kuala Lumpur       KLP           0.10
Below 30N       0.09                  Southeast  0.09                Malaysia   0.09                 Tambun             MAT           0.07
Below 30N       0.09                  Southeast  0.09                Thailand   0.06                 Chanthaburi        CHA           0.06
Below 30N       0.09                  Southeast  0.09                Thailand   0.06                 Kanchanaburi       KAC           0.06
Below 30N       0.09                  Southeast  0.09                Thailand   0.06                 Lampang            LAM           0.06
Below 30N       0.09                  Southeast  0.09                Thailand   0.06                 Sisaket            SSK           0.06
Below 30N       0.09                  Southeast  0.09                Vietnam    0.09                 Hanoi              HAN           0.08
Below 30N       0.09                  Southeast  0.09                Vietnam    0.09                 Ho Chi Minh City   HOC           0.08
Below 30N       0.09                  Southeast  0.09                Vietnam    0.09                 Quy Nhon City      QNC           0.12

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "fst", 
  "fst_neutral_SNPS.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make a scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "fst", "neutral_country.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           HAI      YUN       HUN      OKI       KAN       UTS       KAG
## HAI    0.0000 1035.308  965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081    0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN  965.7432 1107.931    0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283    0.000 1390.3468 1589.7383  617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347    0.0000  288.4938  790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738  288.4938    0.0000 1023.2713
##           TAI      BEN      KUN      KAT       CAM      SUF      SUU      INW
## HAI 1180.1356 3487.311 4241.329 2648.016  987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN  985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI  873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
##          INJ      KLP      MAT       SSK      KAC      CHA       LAM       HAN
## HAI 2844.575 1984.417 1861.932  722.0761 1218.270 1086.825 1070.0656  441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303  714.4744  601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029  953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
##          HOC       QNC
## HAI  990.911  604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
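
The double loop works, but geosphere can compute the whole matrix in one vectorized call. A sketch using distm (distHaversine assumes a spherical Earth by default):

# Vectorized alternative: all pairwise great-circle distances at once
coords <- as.matrix(filtered_sampling_loc[, c("Longitude", "Latitude")])
distance_matrix2 <- distm(coords, fun = distHaversine) / 1000  # km
rownames(distance_matrix2) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix2) <- filtered_sampling_loc$Abbreviation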

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0

# Align the two matrices: 'aa' is ordered by poporder while 'distance_matrix'
# follows the sampling_loc order, so reorder before combining
distance_matrix <- distance_matrix[colnames(aa), colnames(aa)]

# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
# (as.vector() traverses the matrix column by column, so the row label
# varies fastest and the column label repeats in blocks)
data$row_index <- rep(rownames(distance_matrix), times = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), each = nrow(distance_matrix))

data <- data |>
  dplyr::arrange(
    Distance
  )
head(data)
##   Distance FST row_index col_index
## 1        0   0       HAI       HAI
## 2        0   0       YUN       YUN
## 3        0   0       HUN       HUN
## 4        0   0       OKI       OKI
## 5        0   0       KAN       KAN
## 6        0   0       UTS       UTS

Fit a linear regression of linearized Fst, Fst/(1 - Fst), against log distance (Rousset's isolation-by-distance model).

# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))


# Plot
ggplot(data, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = max(data$Distance) * 0.85, y = max(data$FST) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Distance (Km)",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_all_samples.pdf"),
  width = 6,
  height = 4,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
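
The points in this regression are pairwise comparisons and therefore not independent, so the parametric p-value is anti-conservative. A Mantel test is the usual significance test for isolation by distance. A sketch, assuming the vegan package is installed (it is not loaded in the Libraries chunk):

# Mantel test of isolation by distance (a sketch; requires the vegan package)
library(vegan)
fst_lin <- aa / (1 - aa)  # linearized Fst, matching the regression above
mantel(as.dist(distance_matrix), as.dist(fst_lin), permutations = 999)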

Subset by country. Select countries with at least 3 sampling localities.

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "China"     "Japan"     "Indonesia" "Thailand"  "Vietnam"

Do test for each country

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  
  # Perform linear regression
  lm_model <- lm(FST ~ Distance, data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $China
## $China$equation
## [1] "y = -0.00032x + 0.395"
## 
## $China$r2
## [1] "R^2 = 0.87"
## 
## 
## $Japan
## $Japan$equation
## [1] "y = 0.00002x + 0.111"
## 
## $Japan$r2
## [1] "R^2 = 0.02"
## 
## 
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.00002x + 0.105"
## 
## $Indonesia$r2
## [1] "R^2 = 0.11"
## 
## 
## $Thailand
## $Thailand$equation
## [1] "y = 0.00038x + -0.063"
## 
## $Thailand$r2
## [1] "R^2 = 0.46"
## 
## 
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.00002x + 0.108"
## 
## $Vietnam$r2
## [1] "R^2 = 0.07"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = Country) %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == Country)  # Ensures the data is within the same country


# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% countries_to_include) %>%
  ungroup()

Calculate the linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST ~ Distance, data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("y = %.3fx + %.3f", coef(model)[2], coef(model)[1]),
         r2 = sprintf("R^2 = %.2f", summary(model)$r.squared))

Plot it

ggplot(data_filtered, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(data = regression_results, aes(label = paste(equation, r2, sep = "\n"), x = Inf, y = Inf), 
            vjust = 2, hjust = 2, size = 3.5, inherit.aes = FALSE) +
  labs(title = "FST vs Distance by Country",
       x = "Distance",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_countries.pdf"),
  width = 6,
  height = 8,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'

We can merge the Fst and distance matrices.

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the elements based on their position in the matrix:
# Fst with two decimals (upper triangle), distances in km with zero decimals (lower triangle)
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# The merged matrix is now formatted as needed
print(merged_matrix)
##     BEN    CAM    CHA    HAI    HAN    HOC    HUN    INJ    INW    KAC    
## BEN NA     "0.02" "0.02" "0.05" "0.05" "0.06" "0.05" "0.09" "0.12" "0.02" 
## CAM "2976" NA     "0.00" "0.03" "0.03" "0.03" "0.04" "0.08" "0.11" "0.00" 
## CHA "2660" "329"  NA     "0.04" "0.03" "0.04" "0.04" "0.08" "0.10" "-0.00"
## HAI "3487" "987"  "1087" NA     "0.04" "0.03" "0.03" "0.10" "0.13" "0.03" 
## HAN "3134" "1059" "1019" "441"  NA     "0.04" "0.04" "0.11" "0.16" "0.03" 
## HOC "3174" "209"  "537"  "991"  "1146" NA     "0.04" "0.11" "0.15" "0.04" 
## HUN "3913" "1929" "1956" "966"  "954"  "1954" NA     "0.10" "0.13" "0.04" 
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA     "0.07" "0.08" 
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA     "0.11" 
## KAC "2377" "647"  "320"  "1218" "1028" "856"  "1981" "2393" "3492" NA     
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713" 
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456" 
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119" 
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234" 
## KUN "957"  "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052" 
## LAM "2419" "951"  "692"  "1070" "733"  "1139" "1637" "2844" "3859" "475"  
## MAT "2751" "875"  "893"  "1862" "1894" "912"  "2799" "1363" "2650" "1060" 
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265" 
## QNC "3424" "528"  "782"  "604"  "882"  "436"  "1565" "2241" "2880" "1047" 
## SSK "2894" "402"  "368"  "722"  "678"  "547"  "1595" "2390" "3271" "530"  
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859"  "2874" 
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973"  "3147" 
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986"  "3562" "3635" "2397" 
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722" 
## YUN "2806" "1487" "1323" "1035" "601"  "1627" "1108" "3467" "4320" "1179" 
##     KAG    KAN    KAT    KLP    KUN    LAM    MAT    OKI    QNC    SSK   
## BEN "0.09" "0.12" "0.08" "0.07" "0.12" "0.02" "0.03" "0.10" "0.09" "0.02"
## CAM "0.07" "0.10" "0.08" "0.05" "0.14" "0.01" "0.02" "0.08" "0.07" "0.00"
## CHA "0.08" "0.11" "0.08" "0.05" "0.13" "0.01" "0.02" "0.08" "0.07" "0.00"
## HAI "0.07" "0.10" "0.10" "0.07" "0.16" "0.04" "0.04" "0.07" "0.10" "0.04"
## HAN "0.09" "0.12" "0.11" "0.08" "0.19" "0.04" "0.05" "0.09" "0.12" "0.04"
## HOC "0.08" "0.11" "0.11" "0.08" "0.18" "0.05" "0.05" "0.08" "0.11" "0.04"
## HUN "0.06" "0.09" "0.10" "0.07" "0.16" "0.05" "0.05" "0.07" "0.10" "0.04"
## INJ "0.14" "0.17" "0.09" "0.12" "0.21" "0.08" "0.09" "0.14" "0.14" "0.08"
## INW "0.17" "0.21" "0.09" "0.18" "0.28" "0.11" "0.12" "0.18" "0.18" "0.11"
## KAC "0.08" "0.11" "0.08" "0.05" "0.15" "0.00" "0.01" "0.08" "0.07" "0.00"
## KAG NA     "0.11" "0.14" "0.12" "0.20" "0.08" "0.09" "0.10" "0.14" "0.08"
## KAN "791"  NA     "0.17" "0.15" "0.24" "0.11" "0.12" "0.13" "0.17" "0.11"
## KAT "4367" "4880" NA     "0.13" "0.20" "0.08" "0.09" "0.15" "0.15" "0.08"
## KLP "4376" "5163" "3242" NA     "0.22" "0.05" "0.06" "0.13" "0.13" "0.05"
## KUN "6631" "7300" "2779" "3196" NA     "0.14" "0.15" "0.22" "0.22" "0.13"
## LAM "3450" "4160" "1789" "1704" "3202" NA     "0.02" "0.09" "0.08" "0.01"
## MAT "4289" "5073" "3069" "177"  "3124" "1531" NA     "0.09" "0.09" "0.02"
## OKI "618"  "1390" "4204" "3822" "6262" "3060" "3747" NA     "0.14" "0.08"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA     "0.07"
## SSK "3232" "3991" "2412" "1365" "3582" "624"  "1217" "2758" "548"  NA    
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873"  "1548" "1886"
## UTS "1023" "288"  "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714"  "2208" "2678" "1449" "1087"
##     SUF    SUU    TAI    UTS    YUN    
## BEN "0.09" "0.07" "0.16" "0.10" "0.02" 
## CAM "0.08" "0.06" "0.14" "0.08" "0.00" 
## CHA "0.08" "0.06" "0.14" "0.08" "0.00" 
## HAI "0.09" "0.06" "0.13" "0.07" "0.03" 
## HAN "0.12" "0.08" "0.16" "0.10" "0.03" 
## HOC "0.11" "0.07" "0.15" "0.08" "0.04" 
## HUN "0.10" "0.06" "0.12" "0.07" "0.04" 
## INJ "0.06" "0.07" "0.20" "0.14" "0.08" 
## INW "0.07" "0.10" "0.27" "0.18" "0.11" 
## KAC "0.08" "0.06" "0.15" "0.09" "-0.00"
## KAG "0.14" "0.11" "0.16" "0.08" "0.08" 
## KAN "0.17" "0.14" "0.19" "0.09" "0.11" 
## KAT "0.06" "0.07" "0.22" "0.15" "0.08" 
## KLP "0.14" "0.11" "0.20" "0.13" "0.05" 
## KUN "0.23" "0.21" "0.30" "0.21" "0.14" 
## LAM "0.09" "0.07" "0.15" "0.09" "0.01" 
## MAT "0.09" "0.07" "0.15" "0.10" "0.02" 
## OKI "0.14" "0.11" "0.16" "0.11" "0.08" 
## QNC "0.15" "0.13" "0.21" "0.15" "0.07" 
## SSK "0.08" "0.06" "0.14" "0.08" "0.00" 
## SUF NA     "0.03" "0.21" "0.14" "0.08" 
## SUU "381"  NA     "0.18" "0.11" "0.06" 
## TAI "2777" "2747" NA     "0.17" "0.14" 
## UTS "4755" "4566" "2418" NA     "0.08" 
## YUN "3566" "3744" "1928" "3902" NA
cities <- readRDS(here("output", "populations", "cities.rds"))
head(cities)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN

We can sort by distance

# Calculate row-wise mean distances (the zero diagonal is included in the mean)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Initialize and fill the merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format as before, but take absolute values of Fst in the upper triangle
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     SSK    QNC    CAM    HOC    CHA    HAI    HAN    KAC    LAM    YUN   
## SSK NA     "0.07" "0.00" "0.04" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## QNC "548"  NA     "0.07" "0.11" "0.07" "0.10" "0.12" "0.07" "0.08" "0.07"
## CAM "402"  "528"  NA     "0.03" "0.00" "0.03" "0.03" "0.00" "0.01" "0.00"
## HOC "547"  "436"  "209"  NA     "0.04" "0.03" "0.04" "0.04" "0.05" "0.04"
## CHA "368"  "782"  "329"  "537"  NA     "0.04" "0.03" "0.00" "0.01" "0.00"
## HAI "722"  "604"  "987"  "991"  "1087" NA     "0.04" "0.03" "0.04" "0.03"
## HAN "678"  "882"  "1059" "1146" "1019" "441"  NA     "0.03" "0.04" "0.03"
## KAC "530"  "1047" "647"  "856"  "320"  "1218" "1028" NA     "0.00" "0.00"
## LAM "624"  "1155" "951"  "1139" "692"  "1070" "733"  "475"  NA     "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601"  "1179" "714"  NA    
## MAT "1217" "1348" "875"  "912"  "893"  "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966"  "954"  "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
##     MAT    HUN    KLP    TAI    OKI    SUF    INJ    SUU    KAT    KAG   
## SSK "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## QNC "0.09" "0.10" "0.13" "0.21" "0.14" "0.15" "0.14" "0.13" "0.15" "0.14"
## CAM "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.07"
## HOC "0.05" "0.04" "0.08" "0.15" "0.08" "0.11" "0.11" "0.07" "0.11" "0.08"
## CHA "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## HAI "0.04" "0.03" "0.07" "0.13" "0.07" "0.09" "0.10" "0.06" "0.10" "0.07"
## HAN "0.05" "0.04" "0.08" "0.16" "0.09" "0.12" "0.11" "0.08" "0.11" "0.09"
## KAC "0.01" "0.04" "0.05" "0.15" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## LAM "0.02" "0.05" "0.05" "0.15" "0.09" "0.09" "0.08" "0.07" "0.08" "0.08"
## YUN "0.02" "0.04" "0.05" "0.14" "0.08" "0.08" "0.08" "0.06" "0.08" "0.08"
## MAT NA     "0.05" "0.06" "0.15" "0.09" "0.09" "0.09" "0.07" "0.09" "0.09"
## HUN "2799" NA     "0.07" "0.12" "0.07" "0.10" "0.10" "0.06" "0.10" "0.06"
## KLP "177"  "2932" NA     "0.20" "0.13" "0.14" "0.12" "0.11" "0.13" "0.12"
## TAI "2894" "986"  "2979" NA     "0.16" "0.21" "0.20" "0.18" "0.22" "0.16"
## OKI "3747" "1599" "3822" "873"  NA     "0.14" "0.14" "0.11" "0.15" "0.10"
## SUF "2220" "3406" "2115" "2777" "3280" NA     "0.06" "0.03" "0.06" "0.14"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA     "0.07" "0.09" "0.14"
## SUU "2565" "3468" "2469" "2747" "3154" "381"  "1911" NA     "0.07" "0.11"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA     "0.14"
## KAG "4289" "1860" "4376" "1397" "618"  "3893" "4901" "3756" "4367" NA    
## INW "2650" "4248" "2505" "3635" "4110" "859"  "1528" "973"  "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791" 
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
##     INW    BEN    KAN    UTS    KUN   
## SSK "0.11" "0.02" "0.11" "0.08" "0.13"
## QNC "0.18" "0.09" "0.17" "0.15" "0.22"
## CAM "0.11" "0.02" "0.10" "0.08" "0.14"
## HOC "0.15" "0.06" "0.11" "0.08" "0.18"
## CHA "0.10" "0.02" "0.11" "0.08" "0.13"
## HAI "0.13" "0.05" "0.10" "0.07" "0.16"
## HAN "0.16" "0.05" "0.12" "0.10" "0.19"
## KAC "0.11" "0.02" "0.11" "0.09" "0.15"
## LAM "0.11" "0.02" "0.11" "0.09" "0.14"
## YUN "0.11" "0.02" "0.11" "0.08" "0.14"
## MAT "0.12" "0.03" "0.12" "0.10" "0.15"
## HUN "0.13" "0.05" "0.09" "0.07" "0.16"
## KLP "0.18" "0.07" "0.15" "0.13" "0.22"
## TAI "0.27" "0.16" "0.19" "0.17" "0.30"
## OKI "0.18" "0.10" "0.13" "0.11" "0.22"
## SUF "0.07" "0.09" "0.17" "0.14" "0.23"
## INJ "0.07" "0.09" "0.17" "0.14" "0.21"
## SUU "0.10" "0.07" "0.14" "0.11" "0.21"
## KAT "0.09" "0.08" "0.17" "0.15" "0.20"
## KAG "0.17" "0.09" "0.11" "0.08" "0.20"
## INW NA     "0.12" "0.21" "0.18" "0.28"
## BEN "5347" NA     "0.12" "0.10" "0.12"
## KAN "5424" "6410" NA     "0.09" "0.24"
## UTS "5539" "6696" "288"  NA     "0.21"
## KUN "5513" "957"  "7300" "7583" NA

Make a table and save it as a Word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]


# Create a flextable object from the merged data frame
ft <- qflextable(merged_df)

ft

(Flextable rendering of the merged matrix shown above: Fst values in the upper triangle, pairwise distances in km in the lower triangle.)

# Create a new Word document
doc <- read_docx()

# Add the flextable to the Word document
doc <- body_add_flextable(doc, ft)

# Save the Word document
print(doc, target =  here("output", "fst", "intergenic.docx"))

2. SNP set r2 0.01

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  6247934 333.7   11285547 602.8         NA 10548578 563.4
## Vcells 10497484  80.1   18053012 137.8      32768 14977146 114.3
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.01 \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/r2_0.01 \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.01.log
## 21095 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.969806.
## 21095 variants and 230 people pass filters and QC.

Look at https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of the Fst estimation. The --recodeA step above writes a .raw file with genotypes coded 0/1/2, which is the format read.PLINK() expects.

r2_0.01 <-
  read.PLINK(
    here(
      "output", "fst", "r2_0.01.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

summary(r2_0.01)
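
A few adegenet accessors are handy for quick sanity checks on the genlight object; a small usage sketch (the expected counts come from the PLINK log above):

# Quick sanity checks on the genlight object
nInd(r2_0.01)       # number of individuals; should match the 230 in the PLINK log
nLoc(r2_0.01)       # number of loci; should match the 21095 variants
head(pop(r2_0.01))  # population assignments (taken from the family IDs)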

This chunk will take a couple of minutes to run.

# convert
r2_0.01_2 <- stamppConvert(r2_0.01, type="genlight")

# run stampp. If you want to run with bootstraps and nclusters, use the HPC. It will run out of memory on a 32 GB laptop
r2_0.01_3 <- stamppFst(r2_0.01_2, 1, 95, 1)
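
On the HPC the same call can be run with bootstraps to obtain the confidence intervals and p-values; a sketch with illustrative argument values (see the stamppFst help page linked above for the components of the returned list):

# 100 bootstraps, 95% confidence level, 10 cores; with nboots > 1 the result is a
# list holding the Fst matrix plus the p-values, bootstraps, and CI limits
r2_0.01_boot <- stamppFst(r2_0.01_2, nboots = 100, percent = 95, nclusters = 10)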

Save it

saveRDS(
  r2_0.01_3, here(
    "output", "fst", "r2_0.01.rds"
  )
)

To load it

r2_0.01_3 <- readRDS(
  here(
    "output", "fst", "r2_0.01.rds"
  )
)

Now let's look at the object

summary(r2_0.01_3)
##       OKI               HAI               YUN                  KAT         
##  Min.   :0.07411   Min.   :0.03305   Min.   :-0.0000684   Min.   :0.05983  
##  1st Qu.:0.12457   1st Qu.:0.05357   1st Qu.: 0.0202758   1st Qu.:0.09938  
##  Median :0.13500   Median :0.08638   Median : 0.0789742   Median :0.11057  
##  Mean   :0.15247   Mean   :0.09338   Mean   : 0.0821429   Mean   :0.14004  
##  3rd Qu.:0.17245   3rd Qu.:0.13393   3rd Qu.: 0.1383007   3rd Qu.:0.17033  
##  Max.   :0.27404   Max.   :0.19267   Max.   : 0.1838969   Max.   :0.25614  
##  NA's   :1         NA's   :2         NA's   :3            NA's   :4        
##       TAI              HUN               HAN               KLP         
##  Min.   :0.1293   Min.   :0.04315   Min.   :0.03769   Min.   :0.05066  
##  1st Qu.:0.1818   1st Qu.:0.07836   1st Qu.:0.05082   1st Qu.:0.06001  
##  Median :0.1963   Median :0.08788   Median :0.09712   Median :0.13557  
##  Mean   :0.2157   Mean   :0.10679   Mean   :0.10658   Mean   :0.13089  
##  3rd Qu.:0.2423   3rd Qu.:0.12706   3rd Qu.:0.15227   3rd Qu.:0.19086  
##  Max.   :0.3508   Max.   :0.21464   Max.   :0.22468   Max.   :0.23994  
##  NA's   :5        NA's   :6         NA's   :7         NA's   :8        
##       HOC               QNC               SSK                KAC          
##  Min.   :0.04799   Min.   :0.07995   Min.   :0.003314   Min.   :0.001008  
##  1st Qu.:0.06683   1st Qu.:0.08570   1st Qu.:0.011973   1st Qu.:0.018685  
##  Median :0.10139   Median :0.17159   Median :0.110002   Median :0.127829  
##  Mean   :0.11328   Mean   :0.15440   Mean   :0.086747   Mean   :0.096343  
##  3rd Qu.:0.15323   3rd Qu.:0.21452   3rd Qu.:0.148341   3rd Qu.:0.161847  
##  Max.   :0.21621   Max.   :0.24560   Max.   :0.182914   Max.   :0.191394  
##  NA's   :9         NA's   :10        NA's   :11         NA's   :12        
##       CHA                KAN              UTS               CAM          
##  Min.   :0.003687   Min.   :0.1015   Min.   :0.08843   Min.   :0.008604  
##  1st Qu.:0.023862   1st Qu.:0.1789   1st Qu.:0.16593   1st Qu.:0.026054  
##  Median :0.125757   Median :0.1975   Median :0.17580   Median :0.118910  
##  Mean   :0.097930   Mean   :0.2137   Mean   :0.19727   Mean   :0.091140  
##  3rd Qu.:0.149282   3rd Qu.:0.2658   3rd Qu.:0.23861   3rd Qu.:0.127924  
##  Max.   :0.179347   Max.   :0.3277   Max.   :0.29881   Max.   :0.153901  
##  NA's   :13         NA's   :14       NA's   :15        NA's   :16        
##       KAG              BEN               LAM               SUF         
##  Min.   :0.1453   Min.   :0.02933   Min.   :0.01885   Min.   :0.03927  
##  1st Qu.:0.1535   1st Qu.:0.07534   1st Qu.:0.10804   1st Qu.:0.08085  
##  Median :0.1918   Median :0.13014   Median :0.13373   Median :0.09416  
##  Mean   :0.1996   Mean   :0.10628   Mean   :0.11948   Mean   :0.12977  
##  3rd Qu.:0.2348   3rd Qu.:0.13699   3rd Qu.:0.15615   3rd Qu.:0.14373  
##  Max.   :0.2757   Max.   :0.15984   Max.   :0.16749   Max.   :0.29085  
##  NA's   :17       NA's   :18        NA's   :19        NA's   :20       
##       SUU               KUN              INW               INJ        
##  Min.   :0.08632   Min.   :0.1695   Min.   :0.06375   Min.   :0.1298  
##  1st Qu.:0.09984   1st Qu.:0.2147   1st Qu.:0.08902   1st Qu.:0.1298  
##  Median :0.11444   Median :0.2599   Median :0.11429   Median :0.1298  
##  Mean   :0.14125   Mean   :0.2548   Mean   :0.11429   Mean   :0.1298  
##  3rd Qu.:0.15586   3rd Qu.:0.2974   3rd Qu.:0.13955   3rd Qu.:0.1298  
##  Max.   :0.24981   Max.   :0.3349   Max.   :0.16482   Max.   :0.1298  
##  NA's   :21        NA's   :22       NA's   :23        NA's   :24      
##       MAT     
##  Min.   : NA  
##  1st Qu.: NA  
##  Median : NA  
##  Mean   :NaN  
##  3rd Qu.: NA  
##  Max.   : NA  
##  NA's   :25

If you want, you can save the Fst values as a csv file.

# Convert to data frame
r2_0.01_df <- data.frame(r2_0.01_3)

# Save it
write.csv(r2_0.01_df, file = here("output", "fst", "r2_0.01_df.csv"))
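
To read it back later with the population names as row names, note that write.csv stores them in the first, unnamed column:

# Read the csv back, using the first column as row names
r2_0.01_df <- read.csv(here("output", "fst", "r2_0.01_df.csv"), row.names = 1)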

Check the Fst values

head(r2_0.01_df)
##            OKI        HAI        YUN       KAT       TAI HUN HAN KLP HOC QNC
## OKI         NA         NA         NA        NA        NA  NA  NA  NA  NA  NA
## HAI 0.08801929         NA         NA        NA        NA  NA  NA  NA  NA  NA
## YUN 0.12717041 0.05389545         NA        NA        NA  NA  NA  NA  NA  NA
## KAT 0.19706597 0.13069201 0.09803865        NA        NA  NA  NA  NA  NA  NA
## TAI 0.16437593 0.13725088 0.18359315 0.2561412        NA  NA  NA  NA  NA  NA
## HUN 0.07411078 0.03611331 0.07708983 0.1485081 0.1292503  NA  NA  NA  NA  NA
##     SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## YUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KAT  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Now let's get the Fst values from the object r2_0.01_3. When run with bootstraps, it also holds the CI limits and p-values alongside the Fst values. We will convert the data into a symmetric matrix.

aa <- as.matrix(r2_0.01_df)
# Mirror the lower triangle into the empty upper triangle to make the matrix symmetric
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##            OKI        HAI        YUN        KAT       TAI        HUN        HAN
## OKI         NA 0.08801929 0.12717041 0.19706597 0.1643759 0.07411078 0.11064999
## HAI 0.08801929         NA 0.05389545 0.13069201 0.1372509 0.03611331 0.03832073
## YUN 0.12717041 0.05389545         NA 0.09803865 0.1835931 0.07708983 0.04600133
## KAT 0.19706597 0.13069201 0.09803865         NA 0.2561412 0.14850811 0.13423035
## TAI 0.16437593 0.13725088 0.18359315 0.25614121        NA 0.12925031 0.17880081
## HUN 0.07411078 0.03611331 0.07708983 0.14850811 0.1292503         NA 0.05046773
##            KLP        HOC        QNC        SSK           KAC         CHA
## OKI 0.16656985 0.09549009 0.19007520 0.12817076  1.311262e-01 0.125744140
## HAI 0.08638337 0.03305455 0.12288092 0.05630853  5.254290e-02 0.053245910
## YUN 0.05670882 0.06185930 0.08085858 0.00366398 -6.838534e-05 0.001604481
## KAT 0.14708471 0.14244369 0.17033112 0.09937748  1.006626e-01 0.097234843
## TAI 0.23998253 0.15425313 0.24922008 0.18283845  1.967321e-01 0.178141681
## HUN 0.10841416 0.04314995 0.14309475 0.08054483  7.900004e-02 0.077728277
##           KAN        UTS         CAM        KAG        BEN         LAM
## OKI 0.1634040 0.13493024 0.114640596 0.12106263 0.14616521 0.132969333
## HAI 0.1371587 0.11075509 0.043819388 0.09793452 0.07341023 0.058877477
## YUN 0.1838969 0.15870552 0.003279041 0.13988820 0.02448527 0.006938259
## KAT 0.2486329 0.22282018 0.095547956 0.20548615 0.10205812 0.103613929
## TAI 0.2220346 0.19580993 0.169327020 0.18371222 0.19967862 0.191790397
## HUN 0.1110223 0.08527642 0.066905755 0.07356335 0.09705144 0.084689086
##            SUF        SUU       KUN       INW        INJ        MAT
## OKI 0.21643883 0.15398736 0.2740437 0.2564655 0.21162371 0.13506645
## HAI 0.14879059 0.08892088 0.1926687 0.1850661 0.14633947 0.06332853
## YUN 0.13353818 0.09595000 0.1591835 0.1508741 0.12218318 0.01887260
## KAT 0.05982659 0.06964651 0.2290544 0.1035899 0.09395829 0.11057327
## TAI 0.28326958 0.21716118 0.3507787 0.3354167 0.26762867 0.18747764
## HUN 0.16381699 0.10041490 0.2146355 0.2000360 0.16129956 0.08787756

Import sample locations

sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    Region2, Country
  )

# Check it
head(sampling_loc)
## # A tibble: 6 × 7
##   Pop_City Country Latitude Longitude Region Abbreviation Region2  
##   <chr>    <chr>      <dbl>     <dbl> <chr>  <chr>        <chr>    
## 1 Hainan   China       19.2      110. Asia   HAI          East Asia
## 2 Yunnan   China       24.5      101. Asia   YUN          East Asia
## 3 Hunan    China       27.6      112. Asia   HUN          East Asia
## 4 Okinawa  Japan       26.5      128. Asia   OKI          East Asia
## 5 Sendai   Japan       38.3      141. Asia   SEN          East Asia
## 6 Nagasaki Japan       32.8      130. Asia   NAG          East Asia

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##  [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"

Create vector with order of populations

# Extract the populations that appear in r2_0.01_df
populations_in_r2_0.01 <- colnames(r2_0.01_df)

# Keep only those populations, in the order given by order_pops
poporder <- order_pops[order_pops %in% populations_in_r2_0.01]

# Print the reordered populations
print(poporder)
##  [1] "HAI" "YUN" "HUN" "OKI" "KAN" "UTS" "KAG" "TAI" "BEN" "KUN" "KAT" "CAM"
## [13] "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "CHA" "LAM" "HAN" "HOC"
## [25] "QNC"

Let's check if the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Now let's order the matrix using poporder. We will also set the lower triangle of the matrix to NA so only one triangle is plotted.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1          Var2         value        
##  HAI    : 25   HAI    : 25   Min.   :-0.0001  
##  YUN    : 25   YUN    : 25   1st Qu.: 0.0797  
##  HUN    : 25   HUN    : 25   Median : 0.1309  
##  OKI    : 25   OKI    : 25   Mean   : 0.1312  
##  KAN    : 25   KAN    : 25   3rd Qu.: 0.1735  
##  UTS    : 25   UTS    : 25   Max.   : 0.3508  
##  (Other):475   (Other):475   NA's   :325

Now let's plot the data with ggplot. You can click on the little square at the top left of the plot to open it in a new window with the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.01.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)
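
If fonts render poorly in the PDF, ggsave() also accepts a device argument; a variant of the call above using the base-R cairo_pdf device:

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.01.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in",
  device = cairo_pdf
)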

By country

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(r2_0.01_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}
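
# A vectorized sketch of the same averaging (hypothetical alternative; note the
# diagonal comes out NaN rather than NA):
# symmetric_fst_matrix <- matrix(
#   rowMeans(cbind(as.vector(fst_matrix_summary),
#                  as.vector(t(fst_matrix_summary))), na.rm = TRUE),
#   nrow = nrow(fst_matrix_summary),
#   dimnames = dimnames(fst_matrix_summary)
# )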

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# The symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##              Cambodia      China      India  Indonesia     Japan   Malaysia
## Cambodia           NA 0.03800139 0.02605433 0.12095720 0.1346749 0.03574590
## China     0.038001395         NA 0.06498231 0.14143583 0.1092279 0.07026417
## India     0.026054327 0.06498231         NA 0.13584651 0.1678509 0.05980367
## Indonesia 0.120957203 0.14143583 0.13584651         NA 0.2313825 0.15803996
## Japan     0.134674917 0.10922792 0.16785086 0.23138254        NA 0.18141023
## Malaysia  0.035745900 0.07026417 0.05980367 0.15803996 0.1814102         NA
## Maldives  0.153900858 0.18882922 0.13014258 0.28386514 0.2940620 0.20471972
## Nepal     0.095547956 0.13143672 0.10205812 0.08175532 0.2113562 0.12882899
## Taiwan    0.169327020 0.14483616 0.19967862 0.27586902 0.1824474 0.21373008
## Thailand  0.006331097 0.04625628 0.02756166 0.12861509 0.1536568 0.03828791
## Vietnam   0.055359685 0.06885421 0.08411290 0.17209901 0.1477431 0.09951772
##            Maldives      Nepal    Taiwan    Thailand    Vietnam
## Cambodia  0.1539009 0.09554796 0.1693270 0.006331097 0.05535968
## China     0.1888292 0.13143672 0.1448362 0.046256283 0.06885421
## India     0.1301426 0.10205812 0.1996786 0.027561664 0.08411290
## Indonesia 0.2838651 0.08175532 0.2758690 0.128615090 0.17209901
## Japan     0.2940620 0.21135619 0.1824474 0.153656777 0.14774309
## Malaysia  0.2047197 0.12882899 0.2137301 0.038287915 0.09951772
## Maldives         NA 0.22905444 0.3507787 0.162330775 0.22600173
## Nepal     0.2290544         NA 0.2561412 0.100222216 0.14900172
## Taiwan    0.3507787 0.25614121        NA 0.187375652 0.19409134
## Thailand  0.1623308 0.10022222 0.1873757          NA 0.06532773
## Vietnam   0.2260017 0.14900172 0.1940913 0.065327725         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##           Cambodia      China      India Indonesia     Japan   Malaysia
## Cambodia        NA 0.03800139 0.02605433 0.1209572 0.1346749 0.03574590
## China           NA         NA 0.06498231 0.1414358 0.1092279 0.07026417
## India           NA         NA         NA 0.1358465 0.1678509 0.05980367
## Indonesia       NA         NA         NA        NA 0.2313825 0.15803996
## Japan           NA         NA         NA        NA        NA 0.18141023
## Malaysia        NA         NA         NA        NA        NA         NA
## Maldives        NA         NA         NA        NA        NA         NA
## Nepal           NA         NA         NA        NA        NA         NA
## Taiwan          NA         NA         NA        NA        NA         NA
## Thailand        NA         NA         NA        NA        NA         NA
## Vietnam         NA         NA         NA        NA        NA         NA
##            Maldives      Nepal    Taiwan    Thailand    Vietnam
## Cambodia  0.1539009 0.09554796 0.1693270 0.006331097 0.05535968
## China     0.1888292 0.13143672 0.1448362 0.046256283 0.06885421
## India     0.1301426 0.10205812 0.1996786 0.027561664 0.08411290
## Indonesia 0.2838651 0.08175532 0.2758690 0.128615090 0.17209901
## Japan     0.2940620 0.21135619 0.1824474 0.153656777 0.14774309
## Malaysia  0.2047197 0.12882899 0.2137301 0.038287915 0.09951772
## Maldives         NA 0.22905444 0.3507787 0.162330775 0.22600173
## Nepal            NA         NA 0.2561412 0.100222216 0.14900172
## Taiwan           NA         NA        NA 0.187375652 0.19409134
## Thailand         NA         NA        NA          NA 0.06532773
## Vietnam          NA         NA        NA          NA         NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##         Var1           Var2        value        
##  Cambodia :11   Cambodia :11   Min.   :0.00633  
##  China    :11   China    :11   1st Qu.:0.08293  
##  India    :11   India    :11   Median :0.14144  
##  Indonesia:11   Indonesia:11   Mean   :0.14212  
##  Japan    :11   Japan    :11   3rd Qu.:0.18810  
##  Malaysia :11   Malaysia :11   Max.   :0.35078  
##  (Other)  :55   (Other)  :55   NA's   :66

You can click on the little square at the top left of the plot to open it in a new window with the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "pink",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.01_by_country.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 25 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 OKI     0.152 
##  2 HAI     0.0932
##  3 YUN     0.0828
##  4 KAT     0.140 
##  5 TAI     0.211 
##  6 HUN     0.104 
##  7 HAN     0.103 
##  8 KLP     0.130 
##  9 HOC     0.104 
## 10 QNC     0.153 
## # ℹ 15 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Region
  )

# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)

# Rename the Region2 column to Region
fst3 <- fst3 |>
  dplyr::rename(Region = Region2)

# check output
head(fst3)
## # A tibble: 6 × 7
##   Pop_City   Country Latitude Longitude Abbreviation Region mean_fst
##   <chr>      <chr>      <dbl>     <dbl> <chr>        <chr>     <dbl>
## 1 Hainan     China       19.2      110. HAI          East     0.0932
## 2 Yunnan     China       24.5      101. YUN          East     0.0828
## 3 Hunan      China       27.6      112. HUN          East     0.104 
## 4 Okinawa    Japan       26.5      128. OKI          East     0.152 
## 5 Kanazawa   Japan       36.6      137. KAN          East     0.198 
## 6 Utsunomiya Japan       36.6      140. UTS          East     0.173

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0932
##  2 Yunnan      China       24.5      101.  YUN          East     0.0828
##  3 Hunan       China       27.6      112.  HUN          East     0.104 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.152 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.173 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.157 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.211 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0978
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.228 
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>

Mean by country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0932
##  2 Yunnan      China       24.5      101.  YUN          East     0.0828
##  3 Hunan       China       27.6      112.  HUN          East     0.104 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.152 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.173 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.157 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.211 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0978
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.228 
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City
  )

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
##    City        Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0932
##  2 Yunnan      China       24.5      101.  YUN          East     0.0828
##  3 Hunan       China       27.6      112.  HUN          East     0.104 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.152 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.173 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.157 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.211 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0978
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.228 
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## #   Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
## # A tibble: 6 × 9
##   Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
##   <chr>                         <dbl> <chr>               <dbl> <chr>  
## 1 Above 30N                      0.18 East                 0.15 Japan  
## 2 Above 30N                      0.18 East                 0.15 Japan  
## 3 Above 30N                      0.18 East                 0.15 Japan  
## 4 Below 30N                      0.13 East                 0.15 China  
## 5 Below 30N                      0.13 East                 0.15 China  
## 6 Below 30N                      0.13 East                 0.15 China  
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## #   Abbreviation <chr>, mean_fst <dbl>
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values using SNPs after prunning with r2 0.01.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values using SNPs after pruning with r2 0.01.

Latitude_group  mean_fst_by_latitude  Region     mean_fst_by_region  Country    mean_fst_by_country  City               Abbreviation  mean_fst
Above 30N       0.18                  East       0.15                Japan      0.17                 Kagoshima          KAG           0.16
Above 30N       0.18                  East       0.15                Japan      0.17                 Kanazawa           KAN           0.20
Above 30N       0.18                  East       0.15                Japan      0.17                 Utsunomiya         UTS           0.17
Below 30N       0.13                  East       0.15                China      0.09                 Hainan             HAI           0.09
Below 30N       0.13                  East       0.15                China      0.09                 Hunan              HUN           0.10
Below 30N       0.13                  East       0.15                China      0.09                 Yunnan             YUN           0.08
Below 30N       0.13                  East       0.15                Japan      0.17                 Okinawa            OKI           0.15
Below 30N       0.13                  East       0.15                Taiwan     0.21                 Tainan             TAI           0.21
Below 30N       0.13                  South      0.16                India      0.10                 Bengaluru          BEN           0.10
Below 30N       0.13                  South      0.16                Maldives   0.23                 Kunfunadhoo        KUN           0.23
Below 30N       0.13                  South      0.16                Nepal      0.14                 Kathmandu          KAT           0.14
Below 30N       0.13                  Southeast  0.12                Cambodia   0.08                 Phnom Penh         CAM           0.08
Below 30N       0.13                  Southeast  0.12                Indonesia  0.16                 Jakarta            INJ           0.16
Below 30N       0.13                  Southeast  0.12                Indonesia  0.16                 Sulawesi (Forest)  SUF           0.16
Below 30N       0.13                  Southeast  0.12                Indonesia  0.16                 Sulawesi (Urban)   SUU           0.12
Below 30N       0.13                  Southeast  0.12                Indonesia  0.16                 Wainyapu           INW           0.19
Below 30N       0.13                  Southeast  0.12                Malaysia   0.11                 Kuala Lumpur       KLP           0.13
Below 30N       0.13                  Southeast  0.12                Malaysia   0.11                 Tambun             MAT           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.09                 Chanthaburi        CHA           0.08
Below 30N       0.13                  Southeast  0.12                Thailand   0.09                 Kanchanaburi       KAC           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.09                 Lampang            LAM           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.09                 Sisaket            SSK           0.08
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Hanoi              HAN           0.10
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Ho Chi Minh City   HOC           0.10
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Quy Nhon City      QNC           0.15

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "fst", 
  "fst_r2_0.01_SNPS.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make a scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "fst", "r2_0.01_country.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           HAI      YUN       HUN      OKI       KAN       UTS       KAG
## HAI    0.0000 1035.308  965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081    0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN  965.7432 1107.931    0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283    0.000 1390.3468 1589.7383  617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347    0.0000  288.4938  790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738  288.4938    0.0000 1023.2713
##           TAI      BEN      KUN      KAT       CAM      SUF      SUU      INW
## HAI 1180.1356 3487.311 4241.329 2648.016  987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN  985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI  873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
##          INJ      KLP      MAT       SSK      KAC      CHA       LAM       HAN
## HAI 2844.575 1984.417 1861.932  722.0761 1218.270 1086.825 1070.0656  441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303  714.4744  601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029  953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
##          HOC       QNC
## HAI  990.911  604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
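
The double loop above can be replaced by a single call to geosphere::distm(), which applies the haversine formula to every pair of coordinates at once; a minimal sketch using the objects built above:

# Vectorized alternative: distm() returns the full n x n matrix of distances in meters
coords <- as.matrix(filtered_sampling_loc[, c("Longitude", "Latitude")])
distance_matrix_v <- distm(coords, fun = distHaversine) / 1000 # in km
rownames(distance_matrix_v) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix_v) <- filtered_sampling_loc$Abbreviation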

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0


# Align the Fst matrix with the distance matrix (same populations, same order)
aa <- aa[rownames(distance_matrix), colnames(distance_matrix)]

# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

head(data)
##    Distance        FST row_index col_index
## 1    0.0000 0.00000000       HAI       HAI
## 2 1035.3081 0.05389545       HAI       YUN
## 3  965.7432 0.03611331       HAI       HUN
## 4 2047.0214 0.08801929       HAI       OKI
## 5 3269.9948 0.13715870       HAI       KAN
## 6 3527.0034 0.11075509       HAI       UTS

Fit a linear regression of FST/(1 - FST) on log(distance), the standard isolation-by-distance linearization (Rousset 1997)

# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))


# Plot
ggplot(data, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = max(data$Distance) * 0.85, y = max(data$FST) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Distance (Km)",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_all_samples_LD1.pdf"),
  width = 6,
  height = 4,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
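
Because the entries of a pairwise matrix are not independent, the R^2 above is descriptive only; a Mantel test is the usual way to attach a p-value to isolation by distance. A minimal sketch, assuming the vegan package (not in the library list above) is installed:

library(vegan)

# Mantel test on the linearized Fst and log-distance matrices; as.dist() keeps
# only the lower triangle, so the zero diagonal is ignored
fst_lin <- aa / (1 - aa)
log_dist <- log(distance_matrix)
mantel(as.dist(fst_lin), as.dist(log_dist), permutations = 999)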

Subset by country. Select countries with at least 3 sampling localities.

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "China"     "Japan"     "Indonesia" "Thailand"  "Vietnam"

Do the test for each country. With only three localities a country contributes just three distinct pairs, so a very high R^2 (e.g., China below) reflects the tiny sample size and should not be over-interpreted.

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  subset_data <- subset_data[subset_data$Distance > 0, ]
  # Perform linear regression
  lm_model <- lm(FST/(1-FST) ~ log(Distance), data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $China
## $China$equation
## [1] "y = -0.64629x + 4.586"
## 
## $China$r2
## [1] "R^2 = 1.00"
## 
## 
## $Japan
## $Japan$equation
## [1] "y = 0.04826x + -0.142"
## 
## $Japan$r2
## [1] "R^2 = 0.10"
## 
## 
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.05298x + 0.518"
## 
## $Indonesia$r2
## [1] "R^2 = 0.15"
## 
## 
## $Thailand
## $Thailand$equation
## [1] "y = 0.29122x + -1.590"
## 
## $Thailand$r2
## [1] "R^2 = 0.40"
## 
## 
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.05230x + 0.485"
## 
## $Vietnam$r2
## [1] "R^2 = 0.16"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = Country) %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == Country)  # Ensures the data is within the same country


# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% countries_to_include) %>%
  ungroup()

Calculate the linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST/(1-FST) ~ log(Distance), data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("italic(y) == %.3f * italic(x) + %.3f", coef(model)[2], coef(model)[1]),
         r2 = sprintf("italic(R)^2 == %.2f", summary(model)$r.squared))

Plot it

ggplot(data_filtered, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(
    data = regression_results, 
    aes(label = paste(equation, r2, sep = "~~")), 
    x = Inf, y = Inf, 
    vjust = 2, hjust = 1.2, 
    size = 3.5, 
    parse = TRUE, 
    inherit.aes = FALSE
  ) +
  labs(title = "FST vs Distance by Country",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_countries_LD1.pdf"),
  width = 6,
  height = 8,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the elements based on their position in the matrix:
# Fst with two decimals above the diagonal, distance in km with zero decimals below
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     BEN    CAM    CHA    HAI    HAN    HOC    HUN    INJ    INW    KAC   
## BEN NA     "0.03" "0.03" "0.07" "0.07" "0.08" "0.10" "0.13" "0.16" "0.02"
## CAM "2976" NA     "0.00" "0.04" "0.04" "0.05" "0.07" "0.12" "0.15" "0.00"
## CHA "2660" "329"  NA     "0.05" "0.05" "0.06" "0.08" "0.12" "0.15" "0.00"
## HAI "3487" "987"  "1087" NA     "0.04" "0.03" "0.04" "0.15" "0.19" "0.05"
## HAN "3134" "1059" "1019" "441"  NA     "0.04" "0.05" "0.16" "0.21" "0.05"
## HOC "3174" "209"  "537"  "991"  "1146" NA     "0.04" "0.16" "0.20" "0.06"
## HUN "3913" "1929" "1956" "966"  "954"  "1954" NA     "0.16" "0.20" "0.08"
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA     "0.06" "0.13"
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA     "0.16"
## KAC "2377" "647"  "320"  "1218" "1028" "856"  "1981" "2393" "3492" NA    
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713"
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456"
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119"
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234"
## KUN "957"  "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052"
## LAM "2419" "951"  "692"  "1070" "733"  "1139" "1637" "2844" "3859" "475" 
## MAT "2751" "875"  "893"  "1862" "1894" "912"  "2799" "1363" "2650" "1060"
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265"
## QNC "3424" "528"  "782"  "604"  "882"  "436"  "1565" "2241" "2880" "1047"
## SSK "2894" "402"  "368"  "722"  "678"  "547"  "1595" "2390" "3271" "530" 
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859"  "2874"
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973"  "3147"
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986"  "3562" "3635" "2397"
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722"
## YUN "2806" "1487" "1323" "1035" "601"  "1627" "1108" "3467" "4320" "1179"
##     KAG    KAN    KAT    KLP    KUN    LAM    MAT    OKI    QNC    SSK   
## BEN "0.16" "0.20" "0.10" "0.08" "0.13" "0.03" "0.04" "0.15" "0.10" "0.03"
## CAM "0.13" "0.17" "0.10" "0.05" "0.15" "0.01" "0.02" "0.11" "0.08" "0.01"
## CHA "0.14" "0.18" "0.10" "0.05" "0.15" "0.01" "0.02" "0.13" "0.08" "0.00"
## HAI "0.10" "0.14" "0.13" "0.09" "0.19" "0.06" "0.06" "0.09" "0.12" "0.06"
## HAN "0.12" "0.17" "0.13" "0.09" "0.22" "0.05" "0.06" "0.11" "0.13" "0.05"
## HOC "0.11" "0.15" "0.14" "0.10" "0.22" "0.07" "0.07" "0.10" "0.14" "0.06"
## HUN "0.07" "0.11" "0.15" "0.11" "0.21" "0.08" "0.09" "0.07" "0.14" "0.08"
## INJ "0.22" "0.26" "0.09" "0.17" "0.26" "0.13" "0.13" "0.21" "0.19" "0.12"
## INW "0.26" "0.32" "0.10" "0.23" "0.33" "0.16" "0.16" "0.26" "0.23" "0.15"
## KAC "0.14" "0.19" "0.10" "0.06" "0.17" "0.01" "0.02" "0.13" "0.08" "0.00"
## KAG NA     "0.12" "0.21" "0.18" "0.28" "0.15" "0.15" "0.12" "0.20" "0.14"
## KAN "791"  NA     "0.25" "0.23" "0.33" "0.19" "0.19" "0.16" "0.25" "0.18"
## KAT "4367" "4880" NA     "0.15" "0.23" "0.10" "0.11" "0.20" "0.17" "0.10"
## KLP "4376" "5163" "3242" NA     "0.24" "0.06" "0.06" "0.17" "0.14" "0.06"
## KUN "6631" "7300" "2779" "3196" NA     "0.17" "0.17" "0.27" "0.24" "0.16"
## LAM "3450" "4160" "1789" "1704" "3202" NA     "0.02" "0.13" "0.09" "0.01"
## MAT "4289" "5073" "3069" "177"  "3124" "1531" NA     "0.14" "0.10" "0.02"
## OKI "618"  "1390" "4204" "3822" "6262" "3060" "3747" NA     "0.19" "0.13"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA     "0.08"
## SSK "3232" "3991" "2412" "1365" "3582" "624"  "1217" "2758" "548"  NA    
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873"  "1548" "1886"
## UTS "1023" "288"  "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714"  "2208" "2678" "1449" "1087"
##     SUF    SUU    TAI    UTS    YUN    
## BEN "0.14" "0.11" "0.20" "0.17" "0.02" 
## CAM "0.13" "0.09" "0.17" "0.14" "0.00" 
## CHA "0.13" "0.09" "0.18" "0.16" "0.00" 
## HAI "0.15" "0.09" "0.14" "0.11" "0.05" 
## HAN "0.17" "0.10" "0.18" "0.14" "0.05" 
## HOC "0.16" "0.10" "0.15" "0.12" "0.06" 
## HUN "0.16" "0.10" "0.13" "0.09" "0.08" 
## INJ "0.08" "0.09" "0.27" "0.24" "0.12" 
## INW "0.09" "0.12" "0.34" "0.28" "0.15" 
## KAC "0.14" "0.10" "0.20" "0.17" "-0.00"
## KAG "0.23" "0.16" "0.18" "0.09" "0.14" 
## KAN "0.27" "0.21" "0.22" "0.10" "0.18" 
## KAT "0.06" "0.07" "0.26" "0.22" "0.10" 
## KLP "0.19" "0.14" "0.24" "0.20" "0.06" 
## KUN "0.29" "0.25" "0.35" "0.30" "0.16" 
## LAM "0.14" "0.10" "0.19" "0.17" "0.01" 
## MAT "0.14" "0.10" "0.19" "0.17" "0.02" 
## OKI "0.22" "0.15" "0.16" "0.13" "0.13" 
## QNC "0.21" "0.17" "0.25" "0.22" "0.08" 
## SSK "0.13" "0.10" "0.18" "0.16" "0.00" 
## SUF NA     "0.04" "0.28" "0.24" "0.13" 
## SUU "381"  NA     "0.22" "0.18" "0.10" 
## TAI "2777" "2747" NA     "0.20" "0.18" 
## UTS "4755" "4566" "2418" NA     "0.16" 
## YUN "3566" "3744" "1928" "3902" NA
cities <- readRDS(here("output", "populations", "cities.rds"))
head(cities)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Initialize and fill the merged matrix, as before
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format values, taking the absolute value of the Fst (upper triangle) so small negative estimates print as 0.00
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     SSK    QNC    CAM    HOC    CHA    HAI    HAN    KAC    LAM    YUN   
## SSK NA     "0.08" "0.01" "0.06" "0.00" "0.06" "0.05" "0.00" "0.01" "0.00"
## QNC "548"  NA     "0.08" "0.14" "0.08" "0.12" "0.13" "0.08" "0.09" "0.08"
## CAM "402"  "528"  NA     "0.05" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## HOC "547"  "436"  "209"  NA     "0.06" "0.03" "0.04" "0.06" "0.07" "0.06"
## CHA "368"  "782"  "329"  "537"  NA     "0.05" "0.05" "0.00" "0.01" "0.00"
## HAI "722"  "604"  "987"  "991"  "1087" NA     "0.04" "0.05" "0.06" "0.05"
## HAN "678"  "882"  "1059" "1146" "1019" "441"  NA     "0.05" "0.05" "0.05"
## KAC "530"  "1047" "647"  "856"  "320"  "1218" "1028" NA     "0.01" "0.00"
## LAM "624"  "1155" "951"  "1139" "692"  "1070" "733"  "475"  NA     "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601"  "1179" "714"  NA    
## MAT "1217" "1348" "875"  "912"  "893"  "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966"  "954"  "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
##     MAT    HUN    KLP    TAI    OKI    SUF    INJ    SUU    KAT    KAG   
## SSK "0.02" "0.08" "0.06" "0.18" "0.13" "0.13" "0.12" "0.10" "0.10" "0.14"
## QNC "0.10" "0.14" "0.14" "0.25" "0.19" "0.21" "0.19" "0.17" "0.17" "0.20"
## CAM "0.02" "0.07" "0.05" "0.17" "0.11" "0.13" "0.12" "0.09" "0.10" "0.13"
## HOC "0.07" "0.04" "0.10" "0.15" "0.10" "0.16" "0.16" "0.10" "0.14" "0.11"
## CHA "0.02" "0.08" "0.05" "0.18" "0.13" "0.13" "0.12" "0.09" "0.10" "0.14"
## HAI "0.06" "0.04" "0.09" "0.14" "0.09" "0.15" "0.15" "0.09" "0.13" "0.10"
## HAN "0.06" "0.05" "0.09" "0.18" "0.11" "0.17" "0.16" "0.10" "0.13" "0.12"
## KAC "0.02" "0.08" "0.06" "0.20" "0.13" "0.14" "0.13" "0.10" "0.10" "0.14"
## LAM "0.02" "0.08" "0.06" "0.19" "0.13" "0.14" "0.13" "0.10" "0.10" "0.15"
## YUN "0.02" "0.08" "0.06" "0.18" "0.13" "0.13" "0.12" "0.10" "0.10" "0.14"
## MAT NA     "0.09" "0.06" "0.19" "0.14" "0.14" "0.13" "0.10" "0.11" "0.15"
## HUN "2799" NA     "0.11" "0.13" "0.07" "0.16" "0.16" "0.10" "0.15" "0.07"
## KLP "177"  "2932" NA     "0.24" "0.17" "0.19" "0.17" "0.14" "0.15" "0.18"
## TAI "2894" "986"  "2979" NA     "0.16" "0.28" "0.27" "0.22" "0.26" "0.18"
## OKI "3747" "1599" "3822" "873"  NA     "0.22" "0.21" "0.15" "0.20" "0.12"
## SUF "2220" "3406" "2115" "2777" "3280" NA     "0.08" "0.04" "0.06" "0.23"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA     "0.09" "0.09" "0.22"
## SUU "2565" "3468" "2469" "2747" "3154" "381"  "1911" NA     "0.07" "0.16"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA     "0.21"
## KAG "4289" "1860" "4376" "1397" "618"  "3893" "4901" "3756" "4367" NA    
## INW "2650" "4248" "2505" "3635" "4110" "859"  "1528" "973"  "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791" 
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
##     INW    BEN    KAN    UTS    KUN   
## SSK "0.15" "0.03" "0.18" "0.16" "0.16"
## QNC "0.23" "0.10" "0.25" "0.22" "0.24"
## CAM "0.15" "0.03" "0.17" "0.14" "0.15"
## HOC "0.20" "0.08" "0.15" "0.12" "0.22"
## CHA "0.15" "0.03" "0.18" "0.16" "0.15"
## HAI "0.19" "0.07" "0.14" "0.11" "0.19"
## HAN "0.21" "0.07" "0.17" "0.14" "0.22"
## KAC "0.16" "0.02" "0.19" "0.17" "0.17"
## LAM "0.16" "0.03" "0.19" "0.17" "0.17"
## YUN "0.15" "0.02" "0.18" "0.16" "0.16"
## MAT "0.16" "0.04" "0.19" "0.17" "0.17"
## HUN "0.20" "0.10" "0.11" "0.09" "0.21"
## KLP "0.23" "0.08" "0.23" "0.20" "0.24"
## TAI "0.34" "0.20" "0.22" "0.20" "0.35"
## OKI "0.26" "0.15" "0.16" "0.13" "0.27"
## SUF "0.09" "0.14" "0.27" "0.24" "0.29"
## INJ "0.06" "0.13" "0.26" "0.24" "0.26"
## SUU "0.12" "0.11" "0.21" "0.18" "0.25"
## KAT "0.10" "0.10" "0.25" "0.22" "0.23"
## KAG "0.26" "0.16" "0.12" "0.09" "0.28"
## INW NA     "0.16" "0.32" "0.28" "0.33"
## BEN "5347" NA     "0.20" "0.17" "0.13"
## KAN "5424" "6410" NA     "0.10" "0.33"
## UTS "5539" "6696" "288"  NA     "0.30"
## KUN "5513" "957"  "7300" "7583" NA

Make a table and save it as a Word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]


# Create a flextable object from the merged data frame
ft <- qflextable(merged_df)

ft

[Flextable rendering of the merged matrix above, sorted by mean distance: pairwise Fst values in the upper triangle and pairwise geographic distances in km in the lower triangle.]

# Create a new Word document
doc <- read_docx()

# Add the flextable to the Word document
doc <- body_add_flextable(doc, ft)

# Save the Word document
print(doc, target =  here("output", "fst", "LD1.docx"))

3. SNP set r2 0.1

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  6244094 333.5   11233748 600.0         NA 10670513 569.9
## Vcells 10548729  80.5   18029388 137.6      32768 14957815 114.2
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.1 \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/r2_0.1 \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.1.log
## 58152 variants loaded from .bim file.
## --keep-fam: 230 people remaining.
## Total genotyping rate in remaining samples is 0.972163.
## 58152 variants and 230 people pass filters and QC.

See https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of the Fst estimation. The --recodeA step above writes an additive (0/1/2) .raw file, which read.PLINK() loads as a genlight object.

r2_0.1 <-
  read.PLINK(
    here(
      "output", "fst", "r2_0.1.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

summary(r2_0.1)

This chunk will take a couple of minutes to run.

# convert
r2_0.1_2 <- stamppConvert(r2_0.1, type="genlight")

# run stampp. If you want to run it with bootstraps and nclusters, use the HPC; it will run out of memory on a 32 GB laptop
r2_0.1_3 <- stamppFst(r2_0.1_2, 1, 95, 1)
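
The three positional arguments above are nboots, percent, and nclusters. A sketch of the bootstrapped run for the HPC, with named arguments (the settings shown are illustrative, not from this analysis):

# Bootstrapped run for the HPC (illustrative settings): 100 bootstraps,
# 95% confidence intervals, 10 cores
r2_0.1_3 <- stamppFst(r2_0.1_2, nboots = 100, percent = 95, nclusters = 10)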

Save it

saveRDS(
  r2_0.1_3, here(
    "output", "fst", "r2_0.1.rds"
  )
)

To load it

r2_0.1_3 <- readRDS(
  here(
    "output", "fst", "r2_0.1.rds"
  )
)

Now let's look at the object

summary(r2_0.1_3)
##       OKI               HAI               YUN                 KAT         
##  Min.   :0.07442   Min.   :0.03415   Min.   :-0.000061   Min.   :0.06197  
##  1st Qu.:0.11875   1st Qu.:0.04948   1st Qu.: 0.020653   1st Qu.:0.10390  
##  Median :0.13231   Median :0.08208   Median : 0.074156   Median :0.11475  
##  Mean   :0.14973   Mean   :0.09324   Mean   : 0.081748   Mean   :0.14294  
##  3rd Qu.:0.17241   3rd Qu.:0.13763   3rd Qu.: 0.138696   3rd Qu.:0.17543  
##  Max.   :0.26203   Max.   :0.19423   Max.   : 0.184499   Max.   :0.26064  
##  NA's   :1         NA's   :2         NA's   :3           NA's   :4        
##       TAI              HUN               HAN               KLP         
##  Min.   :0.1335   Min.   :0.04272   Min.   :0.03747   Min.   :0.05153  
##  1st Qu.:0.1827   1st Qu.:0.07029   1st Qu.:0.04754   1st Qu.:0.05847  
##  Median :0.1963   Median :0.08814   Median :0.09929   Median :0.13852  
##  Mean   :0.2179   Mean   :0.10291   Mean   :0.10694   Mean   :0.13005  
##  3rd Qu.:0.2400   3rd Qu.:0.12577   3rd Qu.:0.15623   3rd Qu.:0.18669  
##  Max.   :0.3444   Max.   :0.20465   Max.   :0.22103   Max.   :0.24103  
##  NA's   :5        NA's   :6         NA's   :7         NA's   :8        
##       HOC               QNC               SSK                KAC          
##  Min.   :0.04642   Min.   :0.07948   Min.   :0.002698   Min.   :0.001375  
##  1st Qu.:0.06326   1st Qu.:0.08570   1st Qu.:0.011550   1st Qu.:0.018868  
##  Median :0.10400   Median :0.17367   Median :0.113973   Median :0.132199  
##  Mean   :0.11322   Mean   :0.15507   Mean   :0.087070   Mean   :0.095838  
##  3rd Qu.:0.15567   3rd Qu.:0.21453   3rd Qu.:0.147997   3rd Qu.:0.153453  
##  Max.   :0.21166   Max.   :0.24452   Max.   :0.179032   Max.   :0.184788  
##  NA's   :9         NA's   :10        NA's   :11         NA's   :12        
##       CHA                KAN              UTS               CAM          
##  Min.   :0.004038   Min.   :0.1031   Min.   :0.08997   Min.   :0.008172  
##  1st Qu.:0.023675   1st Qu.:0.1759   1st Qu.:0.15738   1st Qu.:0.025363  
##  Median :0.128982   Median :0.1946   Median :0.17412   Median :0.121331  
##  Mean   :0.099625   Mean   :0.2137   Mean   :0.19413   Mean   :0.094094  
##  3rd Qu.:0.149675   3rd Qu.:0.2701   3rd Qu.:0.24146   3rd Qu.:0.137461  
##  Max.   :0.177767   Max.   :0.3177   Max.   :0.28404   Max.   :0.161974  
##  NA's   :13         NA's   :14       NA's   :15        NA's   :16        
##       KAG              BEN               LAM               SUF         
##  Min.   :0.1362   Min.   :0.02818   Min.   :0.01876   Min.   :0.04022  
##  1st Qu.:0.1471   1st Qu.:0.07735   1st Qu.:0.10946   1st Qu.:0.07931  
##  Median :0.1938   Median :0.13012   Median :0.13912   Median :0.09606  
##  Mean   :0.1963   Mean   :0.11092   Mean   :0.12277   Mean   :0.13239  
##  3rd Qu.:0.2366   3rd Qu.:0.14495   3rd Qu.:0.16100   3rd Qu.:0.15057  
##  Max.   :0.2646   Max.   :0.17353   Max.   :0.17178   Max.   :0.29578  
##  NA's   :17       NA's   :18        NA's   :19        NA's   :20       
##       SUU               KUN              INW               INJ        
##  Min.   :0.08678   Min.   :0.1681   Min.   :0.06515   Min.   :0.1366  
##  1st Qu.:0.10211   1st Qu.:0.2155   1st Qu.:0.09282   1st Qu.:0.1366  
##  Median :0.11800   Median :0.2629   Median :0.12049   Median :0.1366  
##  Mean   :0.14286   Mean   :0.2578   Mean   :0.12049   Mean   :0.1366  
##  3rd Qu.:0.15875   3rd Qu.:0.3027   3rd Qu.:0.14816   3rd Qu.:0.1366  
##  Max.   :0.24867   Max.   :0.3425   Max.   :0.17582   Max.   :0.1366  
##  NA's   :21        NA's   :22       NA's   :23        NA's   :24      
##       MAT     
##  Min.   : NA  
##  1st Qu.: NA  
##  Median : NA  
##  Mean   :NaN  
##  3rd Qu.: NA  
##  Max.   : NA  
##  NA's   :25

If you want, you can save the Fst values as a CSV file.

# Convert to data frame
r2_0.1_df <- data.frame(r2_0.1_3)

# Save it
write.csv(r2_0.1_df, file = here("output", "fst", "r2_0.1_df.csv"))
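
To read the table back later (the first CSV column holds the population codes):

# Read the CSV back, restoring population codes as row names
r2_0.1_df <- read.csv(here("output", "fst", "r2_0.1_df.csv"), row.names = 1)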

Check the Fst values

head(r2_0.1_df)
##            OKI        HAI        YUN       KAT       TAI HUN HAN KLP HOC QNC
## OKI         NA         NA         NA        NA        NA  NA  NA  NA  NA  NA
## HAI 0.08802883         NA         NA        NA        NA  NA  NA  NA  NA  NA
## YUN 0.11878133 0.04941474         NA        NA        NA  NA  NA  NA  NA  NA
## KAT 0.19738113 0.13464192 0.10181861        NA        NA  NA  NA  NA  NA  NA
## TAI 0.16804359 0.14154214 0.18449852 0.2606445        NA  NA  NA  NA  NA  NA
## HUN 0.07441617 0.03534541 0.06802916 0.1476221 0.1334822  NA  NA  NA  NA  NA
##     SSK KAC CHA KAN UTS CAM KAG BEN LAM SUF SUU KUN INW INJ MAT
## OKI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## YUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KAT  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TAI  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## HUN  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Now let's get the Fst values from the object r2_0.1_3. When run with bootstraps (as on the HPC), it also holds the bootstrap replicates, CI limits, and p-values. We will convert the data into a matrix.

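# stamppFst fills only the lower triangle; mirror it across the diagonal so
# that aa becomes a full symmetric matrix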
aa <- as.matrix(r2_0.1_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##            OKI        HAI        YUN       KAT       TAI        HUN        HAN
## OKI         NA 0.08802883 0.11878133 0.1973811 0.1680436 0.07441617 0.10763474
## HAI 0.08802883         NA 0.04941474 0.1346419 0.1415421 0.03534541 0.03825566
## YUN 0.11878133 0.04941474         NA 0.1018186 0.1844985 0.06802916 0.04338869
## KAT 0.19738113 0.13464192 0.10181861        NA 0.2606445 0.14762208 0.13924461
## TAI 0.16804359 0.14154214 0.18449852 0.2606445        NA 0.13348223 0.18073656
## HUN 0.07441617 0.03534541 0.06802916 0.1476221 0.1334822         NA 0.04712316
##            KLP        HOC       QNC         SSK           KAC        CHA
## OKI 0.15791274 0.09583416 0.1855034 0.120337901  1.222639e-01 0.11864070
## HAI 0.08207801 0.03414611 0.1192940 0.051748990  4.855596e-02 0.04954779
## YUN 0.05497547 0.05872326 0.0802834 0.003470757 -6.095245e-05 0.00216753
## KAT 0.14992931 0.14537279 0.1754255 0.103904742  1.043844e-01 0.10291812
## TAI 0.23665704 0.15932868 0.2500647 0.183363174  1.938385e-01 0.18037855
## HUN 0.09955817 0.04272361 0.1355892 0.070972254  6.854084e-02 0.06961833
##           KAN        UTS         CAM        KAG        BEN         LAM
## OKI 0.1661065 0.13562198 0.109049041 0.12082719 0.13838680 0.123949616
## HAI 0.1406218 0.11163440 0.041994919 0.09877636 0.06924561 0.053818795
## YUN 0.1797822 0.14989155 0.003581621 0.13178798 0.02349872 0.006781344
## KAT 0.2507648 0.22214457 0.101256096 0.20504141 0.10811511 0.106234969
## TAI 0.2282426 0.19867600 0.171069919 0.18782724 0.20034447 0.189514500
## HUN 0.1159453 0.08822364 0.059878361 0.07511422 0.08813833 0.073799872
##            SUF        SUU       KUN       INW        INJ        MAT
## OKI 0.22063500 0.15757698 0.2620293 0.2615279 0.21402287 0.12899644
## HAI 0.15560449 0.09438831 0.1867310 0.1942290 0.15201957 0.06079123
## YUN 0.14099822 0.09767201 0.1564464 0.1629609 0.12806009 0.01970472
## KAT 0.06196652 0.07006897 0.2304904 0.1073455 0.09416537 0.11474970
## TAI 0.29024752 0.22479493 0.3443908 0.3418771 0.27387778 0.18974486
## HUN 0.16664122 0.10268620 0.2017426 0.2046510 0.16392725 0.08049047

Import sample locations

sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    Region2, Country
  )

# Check it
head(sampling_loc)
## # A tibble: 6 × 7
##   Pop_City Country Latitude Longitude Region Abbreviation Region2  
##   <chr>    <chr>      <dbl>     <dbl> <chr>  <chr>        <chr>    
## 1 Hainan   China       19.2      110. Asia   HAI          East Asia
## 2 Yunnan   China       24.5      101. Asia   YUN          East Asia
## 3 Hunan    China       27.6      112. Asia   HUN          East Asia
## 4 Okinawa  Japan       26.5      128. Asia   OKI          East Asia
## 5 Sendai   Japan       38.3      141. Asia   SEN          East Asia
## 6 Nagasaki Japan       32.8      130. Asia   NAG          East Asia

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##  [1] "HAI" "YUN" "HUN" "OKI" "SEN" "NAG" "SAK" "HIR" "KAN" "YAT" "KYO" "NIG"
## [13] "UTS" "AIZ" "KHO" "SAG" "KAG" "JAT" "TAN" "TAI" "GEL" "BEN" "KUN" "KAT"
## [25] "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC" "SON" "CHA"
## [37] "LAM" "HAN" "HOC" "QNC" "ALV" "POR" "ANT" "MAD" "AWK" "TIK" "BAR" "SAI"
## [49] "PAL" "LOS"

Create a vector with the order of populations

# Extract the populations that appear in r2_0.1_df
populations_in_r2_0.1 <- colnames(r2_0.1_df)

# Reorder the populations based on order_pops
poporder <- populations_in_r2_0.1[populations_in_r2_0.1 %in% order_pops]

# Print the reordered populations
print(poporder)
##  [1] "OKI" "HAI" "YUN" "KAT" "TAI" "HUN" "HAN" "KLP" "HOC" "QNC" "SSK" "KAC"
## [13] "CHA" "KAN" "UTS" "CAM" "KAG" "BEN" "LAM" "SUF" "SUU" "KUN" "INW" "INJ"
## [25] "MAT"

Let's check whether the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Now let's order the matrix using poporder. We will also set the lower triangle of the matrix to NA so each pairwise value appears only once.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1          Var2         value        
##  OKI    : 25   OKI    : 25   Min.   :-0.0001  
##  HAI    : 25   HAI    : 25   1st Qu.: 0.0749  
##  YUN    : 25   YUN    : 25   Median : 0.1320  
##  KAT    : 25   KAT    : 25   Mean   : 0.1313  
##  TAI    : 25   TAI    : 25   3rd Qu.: 0.1763  
##  HUN    : 25   HUN    : 25   Max.   : 0.3444  
##  (Other):475   (Other):475   NA's   :325
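
melt() comes from reshape2; if you prefer to stay in the tidyverse, the same long table can be built with pivot_longer(). A sketch that keeps the Var1/Var2/value names the plot code below expects:

# tidyverse equivalent of reshape2::melt() on the Fst matrix
pairfst.long <- as.data.frame(aa) |>
  tibble::rownames_to_column("Var1") |>
  pivot_longer(-Var1, names_to = "Var2", values_to = "value") |>
  mutate(Var1 = factor(Var1, levels = rownames(aa)),
         Var2 = factor(Var2, levels = colnames(aa)))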

Now let's plot the data with ggplot. You can click the little square at the top left of the plot pane to open it in a new window with the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.1.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

By country

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(r2_0.1_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##              Cambodia      China      India  Indonesia     Japan   Malaysia
## Cambodia           NA 0.03515163 0.02536262 0.12941643 0.1297409 0.03646673
## China     0.035151634         NA 0.06029422 0.14698652 0.1075254 0.06626635
## India     0.025362622 0.06029422         NA 0.14413628 0.1619263 0.05902305
## Indonesia 0.129416433 0.14698652 0.14413628         NA 0.2343967 0.16434783
## Japan     0.129740854 0.10752536 0.16192628 0.23439668        NA 0.17315795
## Malaysia  0.036466732 0.06626635 0.05902305 0.16434783 0.1731579         NA
## Maldives  0.152913626 0.18164001 0.13012441 0.28745228 0.2816610 0.20454714
## Nepal     0.101256096 0.13292617 0.10811511 0.08338658 0.2116824 0.13233951
## Taiwan    0.171069919 0.14825128 0.20034447 0.28269932 0.1864794 0.21320095
## Thailand  0.006041354 0.04158013 0.02653178 0.13544451 0.1460455 0.03823662
## Vietnam   0.054734247 0.06661412 0.08278023 0.17914574 0.1451190 0.09840066
##            Maldives      Nepal    Taiwan    Thailand    Vietnam
## Cambodia  0.1529136 0.10125610 0.1710699 0.006041354 0.05473425
## China     0.1816400 0.13292617 0.1482513 0.041580126 0.06661412
## India     0.1301244 0.10811511 0.2003445 0.026531776 0.08278023
## Indonesia 0.2874523 0.08338658 0.2826993 0.135444507 0.17914574
## Japan     0.2816610 0.21168235 0.1864794 0.146045512 0.14511899
## Malaysia  0.2045471 0.13233951 0.2132009 0.038236622 0.09840066
## Maldives         NA 0.23049036 0.3443908 0.161208330 0.22357228
## Nepal     0.2304904         NA 0.2606445 0.104360558 0.15334765
## Taiwan    0.3443908 0.26064445        NA 0.186773674 0.19670999
## Thailand  0.1612083 0.10436056 0.1867737          NA 0.06323481
## Vietnam   0.2235723 0.15334765 0.1967100 0.063234806         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##           Cambodia      China      India Indonesia     Japan   Malaysia
## Cambodia        NA 0.03515163 0.02536262 0.1294164 0.1297409 0.03646673
## China           NA         NA 0.06029422 0.1469865 0.1075254 0.06626635
## India           NA         NA         NA 0.1441363 0.1619263 0.05902305
## Indonesia       NA         NA         NA        NA 0.2343967 0.16434783
## Japan           NA         NA         NA        NA        NA 0.17315795
## Malaysia        NA         NA         NA        NA        NA         NA
## Maldives        NA         NA         NA        NA        NA         NA
## Nepal           NA         NA         NA        NA        NA         NA
## Taiwan          NA         NA         NA        NA        NA         NA
## Thailand        NA         NA         NA        NA        NA         NA
## Vietnam         NA         NA         NA        NA        NA         NA
##            Maldives      Nepal    Taiwan    Thailand    Vietnam
## Cambodia  0.1529136 0.10125610 0.1710699 0.006041354 0.05473425
## China     0.1816400 0.13292617 0.1482513 0.041580126 0.06661412
## India     0.1301244 0.10811511 0.2003445 0.026531776 0.08278023
## Indonesia 0.2874523 0.08338658 0.2826993 0.135444507 0.17914574
## Japan     0.2816610 0.21168235 0.1864794 0.146045512 0.14511899
## Malaysia  0.2045471 0.13233951 0.2132009 0.038236622 0.09840066
## Maldives         NA 0.23049036 0.3443908 0.161208330 0.22357228
## Nepal            NA         NA 0.2606445 0.104360558 0.15334765
## Taiwan           NA         NA        NA 0.186773674 0.19670999
## Thailand         NA         NA        NA          NA 0.06323481
## Vietnam          NA         NA        NA          NA         NA
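
The double loop is explicit and easy to follow; an equivalent vectorized sketch (written to a suffixed object so it does not clobber the masked matrix above) averages each off-diagonal pair while ignoring NAs:

# Vectorized symmetrization: element-wise mean of M and t(M), ignoring NAs
M <- fst_matrix_summary
sums   <- ifelse(is.na(M), 0, M) + ifelse(is.na(t(M)), 0, t(M))
counts <- (!is.na(M)) + (!is.na(t(M)))
symmetric_fst_matrix_v <- sums / counts  # NaN (rather than NA) where both cells are empty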

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##         Var1           Var2        value        
##  Cambodia :11   Cambodia :11   Min.   :0.00604  
##  China    :11   China    :11   1st Qu.:0.08308  
##  India    :11   India    :11   Median :0.14512  
##  Indonesia:11   Indonesia:11   Mean   :0.14236  
##  Japan    :11   Japan    :11   3rd Qu.:0.18663  
##  Malaysia :11   Malaysia :11   Max.   :0.34439  
##  (Other)  :55   (Other)  :55   NA's   :66

You can click the little square at the top left of the plot pane to open it in a new window with the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "orange",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "fst", "fst_matrix_r2_0.1_by_country.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 25 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 OKI     0.150 
##  2 HAI     0.0930
##  3 YUN     0.0819
##  4 KAT     0.143 
##  5 TAI     0.213 
##  6 HUN     0.101 
##  7 HAN     0.103 
##  8 KLP     0.128 
##  9 HOC     0.104 
## 10 QNC     0.153 
## # ℹ 15 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Region
  )

# Remove " Asia" from the Region2 column
fst3$Region2 <- gsub(" Asia", "", fst3$Region2)

# Rename the Region2 column to Region
fst3 <- fst3 |>
  dplyr::rename(Region = Region2)

# check output
head(fst3)
## # A tibble: 6 × 7
##   Pop_City   Country Latitude Longitude Abbreviation Region mean_fst
##   <chr>      <chr>      <dbl>     <dbl> <chr>        <chr>     <dbl>
## 1 Hainan     China       19.2      110. HAI          East     0.0930
## 2 Yunnan     China       24.5      101. YUN          East     0.0819
## 3 Hunan      China       27.6      112. HUN          East     0.101 
## 4 Okinawa    Japan       26.5      128. OKI          East     0.150 
## 5 Kanazawa   Japan       36.6      137. KAN          East     0.198 
## 6 Utsunomiya Japan       36.6      140. UTS          East     0.169

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 8
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0930
##  2 Yunnan      China       24.5      101.  YUN          East     0.0819
##  3 Hunan       China       27.6      112.  HUN          East     0.101 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.150 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.169 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.154 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.213 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0974
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.225 
## # ℹ 15 more rows
## # ℹ 1 more variable: mean_fst_by_region <dbl>

Mean by country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 9
##    Pop_City    Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0930
##  2 Yunnan      China       24.5      101.  YUN          East     0.0819
##  3 Hunan       China       27.6      112.  HUN          East     0.101 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.150 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.169 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.154 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.213 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0974
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.225 
## # ℹ 15 more rows
## # ℹ 2 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 30, "Above 30N", "Below 30N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City
  )

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 25 × 11
##    City        Country  Latitude Longitude Abbreviation Region mean_fst
##    <chr>       <chr>       <dbl>     <dbl> <chr>        <chr>     <dbl>
##  1 Hainan      China       19.2      110.  HAI          East     0.0930
##  2 Yunnan      China       24.5      101.  YUN          East     0.0819
##  3 Hunan       China       27.6      112.  HUN          East     0.101 
##  4 Okinawa     Japan       26.5      128.  OKI          East     0.150 
##  5 Kanazawa    Japan       36.6      137.  KAN          East     0.198 
##  6 Utsunomiya  Japan       36.6      140.  UTS          East     0.169 
##  7 Kagoshima   Japan       31.6      131.  KAG          East     0.154 
##  8 Tainan      Taiwan      23.0      120.  TAI          East     0.213 
##  9 Bengaluru   India       13.0       77.6 BEN          South    0.0974
## 10 Kunfunadhoo Maldives     5.67      73   KUN          South    0.225 
## # ℹ 15 more rows
## # ℹ 4 more variables: mean_fst_by_region <dbl>, mean_fst_by_country <dbl>,
## #   Latitude_group <chr>, mean_fst_by_latitude <dbl>
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
## # A tibble: 6 × 9
##   Latitude_group mean_fst_by_latitude Region mean_fst_by_region Country
##   <chr>                         <dbl> <chr>               <dbl> <chr>  
## 1 Above 30N                      0.17 East                 0.14 Japan  
## 2 Above 30N                      0.17 East                 0.14 Japan  
## 3 Above 30N                      0.17 East                 0.14 Japan  
## 4 Below 30N                      0.13 East                 0.14 China  
## 5 Below 30N                      0.13 East                 0.14 China  
## 6 Below 30N                      0.13 East                 0.14 China  
## # ℹ 4 more variables: mean_fst_by_country <dbl>, City <chr>,
## #   Abbreviation <chr>, mean_fst <dbl>
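
The mutate_if() call above still works, but it has been superseded in current dplyr; the same rounding with the across() idiom:

# Current dplyr idiom for conditional mutation
fst4 <- fst4 |>
  mutate(across(where(is.numeric), ~ round(.x, 2)))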
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values using SNPs after prunning with r2 0.1.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values using SNPs after pruning with r2 0.1.

Latitude_group  mean_fst_by_latitude  Region     mean_fst_by_region  Country    mean_fst_by_country  City               Abbreviation  mean_fst
Above 30N       0.17                  East       0.14                Japan      0.17                 Kagoshima          KAG           0.15
Above 30N       0.17                  East       0.14                Japan      0.17                 Kanazawa           KAN           0.20
Above 30N       0.17                  East       0.14                Japan      0.17                 Utsunomiya         UTS           0.17
Below 30N       0.13                  East       0.14                China      0.09                 Hainan             HAI           0.09
Below 30N       0.13                  East       0.14                China      0.09                 Hunan              HUN           0.10
Below 30N       0.13                  East       0.14                China      0.09                 Yunnan             YUN           0.08
Below 30N       0.13                  East       0.14                Japan      0.17                 Okinawa            OKI           0.15
Below 30N       0.13                  East       0.14                Taiwan     0.21                 Tainan             TAI           0.21
Below 30N       0.13                  South      0.16                India      0.10                 Bengaluru          BEN           0.10
Below 30N       0.13                  South      0.16                Maldives   0.23                 Kunfunadhoo        KUN           0.23
Below 30N       0.13                  South      0.16                Nepal      0.14                 Kathmandu          KAT           0.14
Below 30N       0.13                  Southeast  0.12                Cambodia   0.08                 Phnom Penh         CAM           0.08
Below 30N       0.13                  Southeast  0.12                Indonesia  0.17                 Jakarta            INJ           0.16
Below 30N       0.13                  Southeast  0.12                Indonesia  0.17                 Sulawesi (Forest)  SUF           0.17
Below 30N       0.13                  Southeast  0.12                Indonesia  0.17                 Sulawesi (Urban)   SUU           0.13
Below 30N       0.13                  Southeast  0.12                Indonesia  0.17                 Wainyapu           INW           0.20
Below 30N       0.13                  Southeast  0.12                Malaysia   0.11                 Kuala Lumpur       KLP           0.13
Below 30N       0.13                  Southeast  0.12                Malaysia   0.11                 Tambun             MAT           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.08                 Chanthaburi        CHA           0.08
Below 30N       0.13                  Southeast  0.12                Thailand   0.08                 Kanchanaburi       KAC           0.08
Below 30N       0.13                  Southeast  0.12                Thailand   0.08                 Lampang            LAM           0.09
Below 30N       0.13                  Southeast  0.12                Thailand   0.08                 Sisaket            SSK           0.08
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Hanoi              HAN           0.10
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Ho Chi Minh City   HOC           0.10
Below 30N       0.13                  Southeast  0.12                Vietnam    0.12                 Quy Nhon City      QNC           0.15

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "fst", 
  "fst_r2_0.1_SNPS.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make a scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "fst", "r2_0.1_country.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           HAI      YUN       HUN      OKI       KAN       UTS       KAG
## HAI    0.0000 1035.308  965.7432 2047.021 3269.9948 3527.0034 2509.6023
## YUN 1035.3081    0.000 1107.9313 2677.890 3618.6467 3902.2397 2967.3675
## HUN  965.7432 1107.931    0.0000 1598.628 2531.9353 2811.3205 1859.8916
## OKI 2047.0214 2677.890 1598.6283    0.000 1390.3468 1589.7383  617.8421
## KAN 3269.9948 3618.647 2531.9353 1390.347    0.0000  288.4938  790.9494
## UTS 3527.0034 3902.240 2811.3205 1589.738  288.4938    0.0000 1023.2713
##           TAI      BEN      KUN      KAT       CAM      SUF      SUU      INW
## HAI 1180.1356 3487.311 4241.329 2648.016  987.4089 2614.484 2752.401 3418.919
## YUN 1928.4815 2805.511 3681.074 1640.258 1487.0890 3565.890 3744.433 4320.057
## HUN  985.7472 3912.881 4777.372 2610.708 1929.4809 3405.526 3467.762 4248.204
## OKI  873.2527 5449.847 6261.845 4204.272 2929.3572 3279.912 3153.546 4110.041
## KAN 2185.3348 6409.766 7299.706 4879.798 4223.3171 4620.830 4452.253 5423.669
## UTS 2418.0533 6696.038 7583.178 5167.449 4469.5839 4755.028 4565.653 5539.121
##          INJ      KLP      MAT       SSK      KAC      CHA       LAM       HAN
## HAI 2844.575 1984.417 1861.932  722.0761 1218.270 1086.825 1070.0656  441.2676
## YUN 3467.496 2375.433 2208.208 1087.1720 1178.960 1323.303  714.4744  601.3046
## HUN 3804.791 2932.389 2798.765 1595.2400 1981.221 1955.637 1636.8029  953.5709
## OKI 4294.410 3821.583 3746.517 2758.3859 3265.291 3110.458 3059.7822 2328.1330
## KAN 5684.650 5163.287 5073.181 3990.8289 4456.275 4356.787 4160.1621 3444.0126
## UTS 5874.116 5396.016 5311.404 4248.9076 4721.785 4613.400 4434.9004 3715.0074
##          HOC       QNC
## HAI  990.911  604.4972
## YUN 1626.780 1449.2378
## HUN 1954.276 1565.1505
## OKI 2839.386 2410.1634
## KAN 4160.982 3724.9122
## UTS 4399.493 3964.5166
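
The nested loop is fine for 25 sites, but geosphere also provides distm(), which returns the whole pairwise matrix in one call; a sketch using the same filtered_sampling_loc:

# Vectorized alternative: distm() computes all pairwise Haversine distances
# (in meters) in a single call
coords <- as.matrix(filtered_sampling_loc[, c("Longitude", "Latitude")])
distance_matrix <- distm(coords, fun = distHaversine) / 1000  # km
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation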

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0

# Align 'aa' with 'distance_matrix': the two matrices were built with different
# population orders, so as.vector() would otherwise pair mismatched entries
aa <- aa[rownames(distance_matrix), colnames(distance_matrix)]

# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

head(data)
##    Distance        FST row_index col_index
## 1    0.0000 0.00000000       HAI       HAI
## 2 1035.3081 0.08802883       HAI       YUN
## 3  965.7432 0.11878133       HAI       HUN
## 4 2047.0214 0.19738113       HAI       OKI
## 5 3269.9948 0.16804359       HAI       KAN
## 6 3527.0034 0.07441617       HAI       UTS

Fit a linear regression. Following Rousset (1997), Fst/(1-Fst) is expected to increase approximately linearly with the log of geographic distance under two-dimensional isolation by distance, so we regress linearized Fst on log(distance).

# Check for non-positive values in Distance
data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("scripts", "analysis", "my_theme2.R"))


# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),  # Increase font size for x-axis
        axis.text.y = element_text(size = 14
                                   ))  # Increase font size for y-axis
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_all_samples_LD2.pdf"),
  width = 5,
  height = 5,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'
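
Because the pairwise entries of a distance matrix are not independent, the OLS R^2 above is descriptive only; the usual significance test for isolation by distance is a Mantel test. A minimal sketch, assuming the vegan package is installed (it is not among the libraries loaded above):

library(vegan)  # assumption: vegan is available

# Mantel test between log-distance and linearized Fst (999 permutations)
fst_lin  <- as.dist(aa / (1 - aa))
log_dist <- as.dist(log(distance_matrix))
mantel(log_dist, fst_lin, method = "pearson", permutations = 999)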

Subset by country. Select countries with at least 3 sampling localities.

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "China"     "Japan"     "Indonesia" "Thailand"  "Vietnam"

Run the test for each country

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  subset_data <- subset_data[subset_data$Distance > 0, ]
  # Perform linear regression
  lm_model <- lm(FST/(1-FST) ~ log(Distance), data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $China
## $China$equation
## [1] "y = -0.60267x + 4.278"
## 
## $China$r2
## [1] "R^2 = 1.00"
## 
## 
## $Japan
## $Japan$equation
## [1] "y = 0.04552x + -0.120"
## 
## $Japan$r2
## [1] "R^2 = 0.08"
## 
## 
## $Indonesia
## $Indonesia$equation
## [1] "y = -0.05263x + 0.512"
## 
## $Indonesia$r2
## [1] "R^2 = 0.16"
## 
## 
## $Thailand
## $Thailand$equation
## [1] "y = 0.28828x + -1.569"
## 
## $Thailand$r2
## [1] "R^2 = 0.38"
## 
## 
## $Vietnam
## $Vietnam$equation
## [1] "y = -0.05628x + 0.520"
## 
## $Vietnam$r2
## [1] "R^2 = 0.15"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = Country) %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == Country)  # Ensures the data is within the same country


# Filter to get the countries with 3 or more sampling localities
countries_to_include <- c("China", "Japan", "Indonesia", "Thailand", "Vietnam")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% countries_to_include) %>%
  ungroup()

Calculate the linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST/(1-FST) ~ log(Distance), data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("italic(y) == %.3f * italic(x) + %.3f", coef(model)[2], coef(model)[1]),
         r2 = sprintf("italic(R)^2 == %.2f", summary(model)$r.squared))
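
do() has been superseded in current dplyr; the same per-country models can be fitted with the nest/map idiom (purrr loads with tidyverse). A sketch producing the same equation and r2 columns:

# Alternative without do(): nest the data by country, then map over the models
regression_results <- data_filtered %>%
  group_by(Country1) %>%
  tidyr::nest() %>%
  mutate(
    model    = purrr::map(data, ~ lm(FST / (1 - FST) ~ log(Distance), data = .x)),
    equation = purrr::map_chr(model, ~ sprintf("italic(y) == %.3f * italic(x) + %.3f",
                                               coef(.x)[2], coef(.x)[1])),
    r2       = purrr::map_chr(model, ~ sprintf("italic(R)^2 == %.2f",
                                               summary(.x)$r.squared))
  ) %>%
  ungroup()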

Plot it

ggplot(data_filtered, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(
    data = regression_results, 
    aes(label = paste(equation, r2, sep = "~~")), 
    x = Inf, y = Inf, 
    vjust = 2, hjust = 1.2, 
    size = 3.5, 
    parse = TRUE, 
    inherit.aes = FALSE
  ) +
  labs(title = "Fst vs Distance by Country",
       x = "Log(Distance)",
       y = "Fst(1-Fst)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_by_distance_countries_LD2.pdf"),
  width = 6,
  height = 8,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the matrix (Fst two decimals and distance in Km with zero decimals)
# Format the elements based on their position in the matrix
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     BEN    CAM    CHA    HAI    HAN    HOC    HUN    INJ    INW    KAC   
## BEN NA     "0.03" "0.02" "0.07" "0.07" "0.08" "0.09" "0.14" "0.17" "0.02"
## CAM "2976" NA     "0.00" "0.04" "0.04" "0.05" "0.06" "0.13" "0.16" "0.00"
## CHA "2660" "329"  NA     "0.05" "0.05" "0.06" "0.07" "0.13" "0.16" "0.00"
## HAI "3487" "987"  "1087" NA     "0.04" "0.03" "0.04" "0.15" "0.19" "0.05"
## HAN "3134" "1059" "1019" "441"  NA     "0.05" "0.05" "0.16" "0.22" "0.04"
## HOC "3174" "209"  "537"  "991"  "1146" NA     "0.04" "0.17" "0.21" "0.06"
## HUN "3913" "1929" "1956" "966"  "954"  "1954" NA     "0.16" "0.20" "0.07"
## INJ "3876" "1989" "2160" "2845" "3034" "1889" "3805" NA     "0.07" "0.13"
## INW "5347" "2908" "3190" "3419" "3762" "2727" "4248" "1528" NA     "0.17"
## KAC "2377" "647"  "320"  "1218" "1028" "856"  "1981" "2393" "3492" NA    
## KAG "5773" "3446" "3595" "2510" "2721" "3377" "1860" "4901" "4718" "3713"
## KAN "6410" "4223" "4357" "3270" "3444" "4161" "2532" "5685" "5424" "4456"
## KAT "1828" "2724" "2424" "2648" "2207" "2921" "2611" "4429" "5611" "2119"
## KLP "2869" "1003" "1055" "1984" "2042" "1011" "2932" "1188" "2505" "1234"
## KUN "957"  "3572" "3288" "4241" "3934" "3749" "4777" "3986" "5513" "3052"
## LAM "2419" "951"  "692"  "1070" "733"  "1139" "1637" "2844" "3859" "475" 
## MAT "2751" "875"  "893"  "1862" "1894" "912"  "2799" "1363" "2650" "1060"
## OKI "5450" "2929" "3110" "2047" "2328" "2839" "1599" "4294" "4110" "3265"
## QNC "3424" "528"  "782"  "604"  "882"  "436"  "1565" "2241" "2880" "1047"
## SSK "2894" "402"  "368"  "722"  "678"  "547"  "1595" "2390" "3271" "530" 
## SUF "4970" "2245" "2557" "2614" "2986" "2046" "3406" "1535" "859"  "2874"
## SUU "5307" "2506" "2827" "2752" "3151" "2300" "3468" "1911" "973"  "3147"
## TAI "4631" "2061" "2237" "1180" "1497" "1982" "986"  "3562" "3635" "2397"
## UTS "6696" "4470" "4613" "3527" "3715" "4399" "2811" "5874" "5539" "4722"
## YUN "2806" "1487" "1323" "1035" "601"  "1627" "1108" "3467" "4320" "1179"
##     KAG    KAN    KAT    KLP    KUN    LAM    MAT    OKI    QNC    SSK   
## BEN "0.15" "0.19" "0.11" "0.08" "0.13" "0.03" "0.04" "0.14" "0.10" "0.03"
## CAM "0.12" "0.17" "0.10" "0.05" "0.15" "0.01" "0.02" "0.11" "0.08" "0.01"
## CHA "0.13" "0.18" "0.10" "0.06" "0.15" "0.01" "0.02" "0.12" "0.08" "0.00"
## HAI "0.10" "0.14" "0.13" "0.08" "0.19" "0.05" "0.06" "0.09" "0.12" "0.05"
## HAN "0.12" "0.17" "0.14" "0.09" "0.22" "0.05" "0.06" "0.11" "0.13" "0.05"
## HOC "0.10" "0.15" "0.15" "0.09" "0.21" "0.06" "0.07" "0.10" "0.13" "0.06"
## HUN "0.08" "0.12" "0.15" "0.10" "0.20" "0.07" "0.08" "0.07" "0.14" "0.07"
## INJ "0.22" "0.27" "0.09" "0.17" "0.26" "0.13" "0.14" "0.21" "0.20" "0.13"
## INW "0.26" "0.32" "0.11" "0.24" "0.34" "0.17" "0.18" "0.26" "0.24" "0.16"
## KAC "0.13" "0.18" "0.10" "0.06" "0.17" "0.01" "0.02" "0.12" "0.08" "0.00"
## KAG NA     "0.13" "0.21" "0.17" "0.26" "0.14" "0.14" "0.12" "0.19" "0.13"
## KAN "791"  NA     "0.25" "0.22" "0.32" "0.19" "0.19" "0.17" "0.24" "0.18"
## KAT "4367" "4880" NA     "0.15" "0.23" "0.11" "0.11" "0.20" "0.18" "0.10"
## KLP "4376" "5163" "3242" NA     "0.24" "0.06" "0.06" "0.16" "0.14" "0.06"
## KUN "6631" "7300" "2779" "3196" NA     "0.17" "0.17" "0.26" "0.24" "0.16"
## LAM "3450" "4160" "1789" "1704" "3202" NA     "0.02" "0.12" "0.09" "0.01"
## MAT "4289" "5073" "3069" "177"  "3124" "1531" NA     "0.13" "0.10" "0.02"
## OKI "618"  "1390" "4204" "3822" "6262" "3060" "3747" NA     "0.19" "0.12"
## QNC "2942" "3725" "2922" "1445" "4070" "1155" "1348" "2410" NA     "0.08"
## SSK "3232" "3991" "2412" "1365" "3582" "624"  "1217" "2758" "548"  NA    
## SUF "3893" "4621" "4967" "2115" "5293" "3182" "2220" "3280" "2117" "2567"
## SUU "3756" "4452" "5209" "2469" "5658" "3420" "2565" "3154" "2309" "2797"
## TAI "1397" "2185" "3537" "2979" "5414" "2219" "2894" "873"  "1548" "1886"
## UTS "1023" "288"  "5167" "5396" "7583" "4435" "5311" "1590" "3965" "4249"
## YUN "2967" "3619" "1640" "2375" "3681" "714"  "2208" "2678" "1449" "1087"
##     SUF    SUU    TAI    UTS    YUN    
## BEN "0.15" "0.11" "0.20" "0.17" "0.02" 
## CAM "0.14" "0.09" "0.17" "0.14" "0.00" 
## CHA "0.14" "0.10" "0.18" "0.15" "0.00" 
## HAI "0.16" "0.09" "0.14" "0.11" "0.05" 
## HAN "0.18" "0.11" "0.18" "0.13" "0.04" 
## HOC "0.17" "0.10" "0.16" "0.12" "0.06" 
## HUN "0.17" "0.10" "0.13" "0.09" "0.07" 
## INJ "0.08" "0.09" "0.27" "0.24" "0.13" 
## INW "0.10" "0.13" "0.34" "0.28" "0.16" 
## KAC "0.15" "0.10" "0.19" "0.15" "-0.00"
## KAG "0.23" "0.17" "0.19" "0.09" "0.13" 
## KAN "0.27" "0.21" "0.23" "0.10" "0.18" 
## KAT "0.06" "0.07" "0.26" "0.22" "0.10" 
## KLP "0.20" "0.14" "0.24" "0.19" "0.05" 
## KUN "0.30" "0.25" "0.34" "0.28" "0.16" 
## LAM "0.15" "0.10" "0.19" "0.16" "0.01" 
## MAT "0.15" "0.11" "0.19" "0.16" "0.02" 
## OKI "0.22" "0.16" "0.17" "0.14" "0.12" 
## QNC "0.22" "0.17" "0.25" "0.21" "0.08" 
## SSK "0.14" "0.10" "0.18" "0.15" "0.00" 
## SUF NA     "0.04" "0.29" "0.24" "0.14" 
## SUU "381"  NA     "0.22" "0.18" "0.10" 
## TAI "2777" "2747" NA     "0.20" "0.18" 
## UTS "4755" "4566" "2418" NA     "0.15" 
## YUN "3566" "3744" "1928" "3902" NA
cities <- readRDS(here("output", "populations", "cities.rds"))
head(cities)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Your existing code to initialize and fill the merged_matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     SSK    QNC    CAM    HOC    CHA    HAI    HAN    KAC    LAM    YUN   
## SSK NA     "0.08" "0.01" "0.06" "0.00" "0.05" "0.05" "0.00" "0.01" "0.00"
## QNC "548"  NA     "0.08" "0.13" "0.08" "0.12" "0.13" "0.08" "0.09" "0.08"
## CAM "402"  "528"  NA     "0.05" "0.00" "0.04" "0.04" "0.00" "0.01" "0.00"
## HOC "547"  "436"  "209"  NA     "0.06" "0.03" "0.05" "0.06" "0.06" "0.06"
## CHA "368"  "782"  "329"  "537"  NA     "0.05" "0.05" "0.00" "0.01" "0.00"
## HAI "722"  "604"  "987"  "991"  "1087" NA     "0.04" "0.05" "0.05" "0.05"
## HAN "678"  "882"  "1059" "1146" "1019" "441"  NA     "0.04" "0.05" "0.04"
## KAC "530"  "1047" "647"  "856"  "320"  "1218" "1028" NA     "0.01" "0.00"
## LAM "624"  "1155" "951"  "1139" "692"  "1070" "733"  "475"  NA     "0.01"
## YUN "1087" "1449" "1487" "1627" "1323" "1035" "601"  "1179" "714"  NA    
## MAT "1217" "1348" "875"  "912"  "893"  "1862" "1894" "1060" "1531" "2208"
## HUN "1595" "1565" "1929" "1954" "1956" "966"  "954"  "1981" "1637" "1108"
## KLP "1365" "1445" "1003" "1011" "1055" "1984" "2042" "1234" "1704" "2375"
## TAI "1886" "1548" "2061" "1982" "2237" "1180" "1497" "2397" "2219" "1928"
## OKI "2758" "2410" "2929" "2839" "3110" "2047" "2328" "3265" "3060" "2678"
## SUF "2567" "2117" "2245" "2046" "2557" "2614" "2986" "2874" "3182" "3566"
## INJ "2390" "2241" "1989" "1889" "2160" "2845" "3034" "2393" "2844" "3467"
## SUU "2797" "2309" "2506" "2300" "2827" "2752" "3151" "3147" "3420" "3744"
## KAT "2412" "2922" "2724" "2921" "2424" "2648" "2207" "2119" "1789" "1640"
## KAG "3232" "2942" "3446" "3377" "3595" "2510" "2721" "3713" "3450" "2967"
## INW "3271" "2880" "2908" "2727" "3190" "3419" "3762" "3492" "3859" "4320"
## BEN "2894" "3424" "2976" "3174" "2660" "3487" "3134" "2377" "2419" "2806"
## KAN "3991" "3725" "4223" "4161" "4357" "3270" "3444" "4456" "4160" "3619"
## UTS "4249" "3965" "4470" "4399" "4613" "3527" "3715" "4722" "4435" "3902"
## KUN "3582" "4070" "3572" "3749" "3288" "4241" "3934" "3052" "3202" "3681"
##     MAT    HUN    KLP    TAI    OKI    SUF    INJ    SUU    KAT    KAG   
## SSK "0.02" "0.07" "0.06" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## QNC "0.10" "0.14" "0.14" "0.25" "0.19" "0.22" "0.20" "0.17" "0.18" "0.19"
## CAM "0.02" "0.06" "0.05" "0.17" "0.11" "0.14" "0.13" "0.09" "0.10" "0.12"
## HOC "0.07" "0.04" "0.09" "0.16" "0.10" "0.17" "0.17" "0.10" "0.15" "0.10"
## CHA "0.02" "0.07" "0.06" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## HAI "0.06" "0.04" "0.08" "0.14" "0.09" "0.16" "0.15" "0.09" "0.13" "0.10"
## HAN "0.06" "0.05" "0.09" "0.18" "0.11" "0.18" "0.16" "0.11" "0.14" "0.12"
## KAC "0.02" "0.07" "0.06" "0.19" "0.12" "0.15" "0.13" "0.10" "0.10" "0.13"
## LAM "0.02" "0.07" "0.06" "0.19" "0.12" "0.15" "0.13" "0.10" "0.11" "0.14"
## YUN "0.02" "0.07" "0.05" "0.18" "0.12" "0.14" "0.13" "0.10" "0.10" "0.13"
## MAT NA     "0.08" "0.06" "0.19" "0.13" "0.15" "0.14" "0.11" "0.11" "0.14"
## HUN "2799" NA     "0.10" "0.13" "0.07" "0.17" "0.16" "0.10" "0.15" "0.08"
## KLP "177"  "2932" NA     "0.24" "0.16" "0.20" "0.17" "0.14" "0.15" "0.17"
## TAI "2894" "986"  "2979" NA     "0.17" "0.29" "0.27" "0.22" "0.26" "0.19"
## OKI "3747" "1599" "3822" "873"  NA     "0.22" "0.21" "0.16" "0.20" "0.12"
## SUF "2220" "3406" "2115" "2777" "3280" NA     "0.08" "0.04" "0.06" "0.23"
## INJ "1363" "3805" "1188" "3562" "4294" "1535" NA     "0.09" "0.09" "0.22"
## SUU "2565" "3468" "2469" "2747" "3154" "381"  "1911" NA     "0.07" "0.17"
## KAT "3069" "2611" "3242" "3537" "4204" "4967" "4429" "5209" NA     "0.21"
## KAG "4289" "1860" "4376" "1397" "618"  "3893" "4901" "3756" "4367" NA    
## INW "2650" "4248" "2505" "3635" "4110" "859"  "1528" "973"  "5611" "4718"
## BEN "2751" "3913" "2869" "4631" "5450" "4970" "3876" "5307" "1828" "5773"
## KAN "5073" "2532" "5163" "2185" "1390" "4621" "5685" "4452" "4880" "791" 
## UTS "5311" "2811" "5396" "2418" "1590" "4755" "5874" "4566" "5167" "1023"
## KUN "3124" "4777" "3196" "5414" "6262" "5293" "3986" "5658" "2779" "6631"
##     INW    BEN    KAN    UTS    KUN   
## SSK "0.16" "0.03" "0.18" "0.15" "0.16"
## QNC "0.24" "0.10" "0.24" "0.21" "0.24"
## CAM "0.16" "0.03" "0.17" "0.14" "0.15"
## HOC "0.21" "0.08" "0.15" "0.12" "0.21"
## CHA "0.16" "0.02" "0.18" "0.15" "0.15"
## HAI "0.19" "0.07" "0.14" "0.11" "0.19"
## HAN "0.22" "0.07" "0.17" "0.13" "0.22"
## KAC "0.17" "0.02" "0.18" "0.15" "0.17"
## LAM "0.17" "0.03" "0.19" "0.16" "0.17"
## YUN "0.16" "0.02" "0.18" "0.15" "0.16"
## MAT "0.18" "0.04" "0.19" "0.16" "0.17"
## HUN "0.20" "0.09" "0.12" "0.09" "0.20"
## KLP "0.24" "0.08" "0.22" "0.19" "0.24"
## TAI "0.34" "0.20" "0.23" "0.20" "0.34"
## OKI "0.26" "0.14" "0.17" "0.14" "0.26"
## SUF "0.10" "0.15" "0.27" "0.24" "0.30"
## INJ "0.07" "0.14" "0.27" "0.24" "0.26"
## SUU "0.13" "0.11" "0.21" "0.18" "0.25"
## KAT "0.11" "0.11" "0.25" "0.22" "0.23"
## KAG "0.26" "0.15" "0.13" "0.09" "0.26"
## INW NA     "0.17" "0.32" "0.28" "0.34"
## BEN "5347" NA     "0.19" "0.17" "0.13"
## KAN "5424" "6410" NA     "0.10" "0.32"
## UTS "5539" "6696" "288"  NA     "0.28"
## KUN "5513" "957"  "7300" "7583" NA

Make a table and save it as a Word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns so Population comes first
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]

# Create a flextable object from the merged data frame
ft <- qflextable(merged_df)

ft

[Rendered flextable: the distance-sorted merged matrix above as a Word-ready table, with pairwise Fst (upper triangle, two decimals) and great-circle distances in km (lower triangle, zero decimals) for the 25 populations; the values duplicate the printed matrix.]

# Create a new Word document
doc <- read_docx()

# Add the flextable to the Word document
doc <- body_add_flextable(doc, ft)

# Save the Word document
print(doc, target =  here("output", "fst", "LD2.docx"))
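If a plain-text copy of the table is also useful, the same data frame can be written out with write.csv (a sketch; the file name is an assumption):

# Also save the merged Fst/distance table as CSV (hypothetical file name)
write.csv(merged_df, here("output", "fst", "LD2_fst_distance.csv"), row.names = FALSE)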

4. Comparison of the 3 SNP sets

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  6194515 330.9   11233262 600.0         NA 11233262 600.0
## Vcells 10497539  80.1   18029751 137.6      32768 14958126 114.2

Import the data

Intergenic

intergenic <- readRDS(
  here(
    "output", "fst", "neutral_country.rds"
  )
)
head(intergenic)
## # A tibble: 6 × 2
##   Country   mean_fst
##   <chr>        <dbl>
## 1 Cambodia      0.05
## 2 China         0.07
## 3 India         0.07
## 4 Indonesia     0.11
## 5 Japan         0.12
## 6 Malaysia      0.09

LD1

LD1 <- readRDS(
  here(
    "output", "fst", "r2_0.01_country.rds"
  )
)
head(LD1)
## # A tibble: 6 × 2
##   Country   mean_fst
##   <chr>        <dbl>
## 1 Cambodia      0.08
## 2 China         0.09
## 3 India         0.1 
## 4 Indonesia     0.16
## 5 Japan         0.17
## 6 Malaysia      0.11

LD2

LD2 <- readRDS(
  here(
    "output", "fst", "r2_0.1_country.rds"
  )
)
head(LD2)
## # A tibble: 6 × 2
##   Country   mean_fst
##   <chr>        <dbl>
## 1 Cambodia      0.08
## 2 China         0.09
## 3 India         0.1 
## 4 Indonesia     0.17
## 5 Japan         0.17
## 6 Malaysia      0.11

Merge the data sets

# Add an identifier column to each dataset
intergenic$dataset <- "intergenic"
LD1$dataset <- "LD1"
LD2$dataset <- "LD2"

# Merge the datasets
combined_data <- rbind(intergenic, LD1, LD2)

# Assign a numeric index for plotting based on ordered countries
combined_data <- combined_data %>% 
  arrange(Country) %>%
  group_by(Country) %>%
  mutate(index = dense_rank(Country))

# Custom color mapping
color_mapping <- scale_color_manual(
  values = c(intergenic = "gray", LD1 = "blue", LD2 = "orange"),
  name = "Dataset"
)


# Shapes
shape_mapping <- scale_shape_manual(
  values = c(intergenic = 16, LD1 = 17, LD2 = 18),
  name = "Dataset"
)


# Scatter plot with custom colors and shapes
plot <- ggplot(combined_data, aes(x = Country, y = mean_fst, color = dataset, shape = dataset)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE, aes(group = dataset)) +  # Fitted line per dataset
  labs(
    title = "",
    x = "Countries",
    y = "Mean Fst"
  ) +
  color_mapping +
  shape_mapping +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "top")

# Display the plot
print(plot)
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "fst", "fst_datasets_country.pdf"),
  plot, 
  width = 7,
  height = 5,
  units = "in"
)
## `geom_smooth()` using formula = 'y ~ x'

5. Mantel test with LD2

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/populations/snps_sets/r2_0.1 \
--keep-fam output/fst/pops_4fst.txt \
--make-bed \
--out output/fst/mantel \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel.log

Then make sure to edit the .fam file so the line “BEN a 0 0 0 -9” reads “BEN b 0 0 0 -9”; two BEN samples were given the same individual ID, and each family/individual ID pair must be unique.
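Instead of editing by hand, the fix can be scripted; a minimal R sketch, assuming the standard six-column .fam layout and that the duplicate is the second “BEN a” entry:

# Rename the second duplicated "BEN a" sample to "b" (sketch)
fam <- read.table(here("output", "fst", "mantel.fam"), header = FALSE)
dup <- which(fam$V1 == "BEN" & fam$V2 == "a")
if (length(dup) > 1) fam$V2[dup[2]] <- "b"
write.table(fam, here("output", "fst", "mantel.fam"),
            quote = FALSE, row.names = FALSE, col.names = FALSE)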

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/mantel \
--recodeA \
--out output/fst/mantel \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel.log

Import the data and convert it to genind format

# import the data
albo <-
  read.PLINK(
    here("output", "fst", "mantel.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)
saveRDS(albo2, here(
  "output", "fst", "albo2.rds"
))

Load it

albo2 <- readRDS(here(
  "output", "fst", "albo2.rds"
))

Import sample locations

sampling_loc <- readRDS(here("output", "populations", "cities.rds"))
head(sampling_loc)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN

The fam file

fam_file <- here(
  "output", "fst", "mantel.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# # Replace the BEN a with BEN b (remember to never name samples with the same ID... I change it manually in the fam file.)
# fam_data <- fam_data %>% 
#   mutate(IndividualID = ifelse(FamilyID == "BEN" & IndividualID == "a", "b", IndividualID))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      OKI         1001          0          0   2        -9  Okinawa   Japan
## 2      OKI         1002          0          0   2        -9  Okinawa   Japan
## 3      OKI         1003          0          0   2        -9  Okinawa   Japan
## 4      OKI         1004          0          0   2        -9  Okinawa   Japan
## 5      OKI         1005          0          0   2        -9  Okinawa   Japan
## 6      OKI         1006          0          0   1        -9  Okinawa   Japan
##   Latitude Longitude Region
## 1  26.5013  127.9454   Asia
## 2  26.5013  127.9454   Asia
## 3  26.5013  127.9454   Asia
## 4  26.5013  127.9454   Asia
## 5  26.5013  127.9454   Asia
## 6  26.5013  127.9454   Asia

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##         [,1]     [,2]
## [1,] 26.5013 127.9454
## [2,] 26.5013 127.9454
## [3,] 26.5013 127.9454
## [4,] 26.5013 127.9454
## [5,] 26.5013 127.9454
## [6,] 26.5013 127.9454
colnames(albo_dist2)<- c("x","y") 

Add jitter, so that individuals from the same population do not share identical coordinates (identical points would give zero geographic distances between them)

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 26.51917 127.9537
## [2,] 26.51278 127.9527
## [3,] 26.52068 127.9563
## [4,] 26.52102 127.9391
## [5,] 26.51013 127.9400
## [6,] 26.51191 127.9547
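Note that dist(), used later for Dgeo in the Mantel test, computes Euclidean distances on these coordinates, i.e. in decimal degrees. If great-circle distances are preferred, geosphere (already loaded) can supply them; a sketch, remembering that albo_dist2 stores latitude in column x and longitude in column y, while geosphere expects (longitude, latitude):

# Optional: Haversine distances in km rather than Euclidean degrees (sketch)
# Columns are swapped because geosphere expects (longitude, latitude)
Dgeo_km <- as.dist(distm(albo_dist2[, c("y", "x")], fun = distHaversine) / 1000)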

Add the coordinates to the genind object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "output", "fst", "albo2.rds"
  )
)

Isolation by distance

Convert the data from genind to genpop format

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Select one mosquito per population; this is only to obtain a single set of geographic coordinates for each population

# Note: sample() is random; call set.seed() first if you need the same
# individuals (and hence the same coordinates) across runs
unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

# Edwards' angular distance between populations (dist.genpop method 2)
Dgen <- dist.genpop(toto, method = 2)
# Euclidean distance between the jittered coordinates (in degrees)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen, Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: 0.225538 
## 
## Based on 999 replicates
## Simulated p-value: 0.074 
## Alternative hypothesis: greater 
## 
##     Std.Obs Expectation    Variance 
## 1.504698485 0.005512092 0.021382012

Explanations: Observation: This is the observed Mantel statistic for your data, 0.225538. It is the correlation between the two distance matrices; a positive value means that as one distance increases, so does the other.

Monte-Carlo Test: One of the matrices is repeatedly permuted and the Mantel statistic recalculated for each permutation. This builds the distribution of the statistic under the null hypothesis, which states that there is no association between the matrices.

Simulated p-value: The p-value is 0.074, the proportion of permutations in which the Mantel statistic was greater than or equal to the observed value (0.225538). It is above the conventional 0.05 cutoff, so the observed correlation is suggestive but not conventionally significant.

Alternative Hypothesis: greater: This indicates a one-sided test of whether the observed Mantel statistic is larger than expected by chance.

Std.Obs (Standardized Observation): This is the observed statistic standardized using the null expectation and variance from the Monte-Carlo replicates: (0.225538 − 0.005512) / sqrt(0.021382) ≈ 1.50.

Expectation and Variance: These are the mean (0.005512092) and variance (0.021382012) of the Mantel statistic under the null hypothesis, estimated from the Monte-Carlo replicates.

Interpretation: The observed Mantel statistic (0.225538) suggests a moderate positive correlation between genetic and geographic distances. The p-value of 0.074 indicates a trend toward isolation by distance, but it is not strong enough to reject the null hypothesis of no correlation at the 0.05 level. The test specifically asked whether genetic distance increases with geographic distance (one-sided). The standardized observation being positive and relatively large (about 1.50) likewise indicates that the observed correlation is somewhat higher than expected by chance.
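For intuition, here is a minimal sketch of the permutation scheme behind the simulated p-value (an illustration, not adegenet's actual implementation; mantel_perm is a made-up helper name):

# Minimal Mantel permutation test (sketch, for intuition only)
mantel_perm <- function(Dgen, Dgeo, nperm = 999) {
  m1 <- as.matrix(Dgen)
  m2 <- as.matrix(Dgeo)
  obs <- cor(as.vector(as.dist(m1)), as.vector(as.dist(m2)))
  sims <- replicate(nperm, {
    idx <- sample(nrow(m2))  # permute rows and columns together
    cor(as.vector(as.dist(m1)), as.vector(as.dist(m2[idx, idx])))
  })
  # one-sided ("greater") p-value, counting the observation itself
  p <- (sum(sims >= obs) + 1) / (nperm + 1)
  list(observation = obs, p.value = p)
}
# mantel_perm(Dgen, Dgeo)
# Sanity check of the reported Std.Obs:
# (0.225538 - 0.005512) / sqrt(0.021382) is about 1.50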

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "fst", "sim.pdf"))
plot(ibd)
dev.off()
## quartz_off_screen 
##                 2
plot(ibd)

plot(Dgeo, Dgen)
# Fit a linear model with genetic distance (Dgen) as the response and
# geographic distance (Dgeo) as the predictor. The dist objects are coerced
# with as.vector() because lm() requires vectors, not dist structures.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col = "red", lty = 2)

Add the equation

# Plotting the data
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
# You can adjust the position (e.g., x and y values) as necessary
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.9, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.85, labels = r2_label)

Use library MASS (kde2d) to add a density surface to the plot

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
png(here("output", "fst", "ibd2.png"),
    width = 5,
    height = 4,
    units = 'in',
    res = 300)
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.5, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.45, labels = r2_label)

title("Isolation by distance")
dev.off()
## quartz_off_screen 
##                 2

Check the populations - I did not include those with fewer than 4 mosquitoes

summary(albo2$pop)
## BEN CAM CHA HAI HAN HOC HUN INJ INW KAC KAG KAN KAT KLP KUN LAM MAT OKI QNC SSK 
##  12  12  12  12   4   7  12  11   4   6  12  11  10   4   4   9  12  11  12  12 
## SUF SUU TAI UTS YUN 
##   6   6   8  12   9
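A quick programmatic check of the same filter (a sketch):

# Should return TRUE: every retained population has at least 4 individuals
all(table(albo2$pop) >= 4)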