A. Get neuro-admixture and set up a run to test it

The software is available at https://github.com/ai-sandbox/neural-admixture

On the HPC we first need to create an interactive session

# Create and navigate to the neuroadmixture directory
cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;

# Create interactive session with 1 CPUs
salloc --partition devel --time=06:00:00 --nodes=1 --ntasks=1 --cpus-per-task=4 --mem-per-cpu=5120 zsh;

# Load miniconda (do not use the default it did not work for me)
module load miniconda/23.1.0

# Create a conda environment
conda create --name nadmenv python=3.9
# environment location: /gpfs/gibbs/project/powell/lvc26/conda_envs/nadmenv

# Activate the new env
conda activate nadmenv

# Install neuro-admixture
pip install neural-admixture --user

# To deactivate it later
conda deactivate

# If you want to remove it
# conda env remove -n nadmenv

source /vast/palmer/apps/avx2/software/miniconda/23.1.0/etc/profile.d/conda.sh;
module load miniconda/23.1.0; 
conda activate nadmenv; 
export PYTHONPATH=/home/lvc26/project/conda_envs/nadmenv

On the mac laptop

pip install neural-admixture
export KMP_DUPLICATE_LIB_OK=TRUE

neural-admixture train --help

Libraries

library(tidyverse)
library(here)
library(colorout)
library(dplyr)
library(ggplot2)
library(extrafont)

1. Train with r2 0.01 filter

Prune the data

plink2 \
--allow-extra-chr \
--bfile output/populations/file7 \
--indep-pairwise 5 1 0.01 \
--out output/populations/nadmix/indepSNP \
--silent;
grep 'pairwise\|variants\|samples' output/populations/nadmix/indepSNP.log
##   --indep-pairwise 5 1 0.01
## 237 samples (30 females, 67 males, 140 ambiguous; 237 founders) loaded from
## 82731 variants loaded from output/populations/file7.bim.
## --indep-pairwise (3 compute threads): 61277/82731 variants removed.

Look at the Admixture plot with k=5 and choose the populations with individuals with single ancestry or low admixture (more than 50% of single ancestry)

color_palette <-
  c(
    "v1" = "#00FF00", # KAN UTS
    "v2" = "#FFFF00", # YUN BEN
    "v3" = "#0000FF", # INJ INW
    "v4" = "#FF00FF", # QNC
    "v5" = "#FF0000"  # TAI OKI
  )

Create a new bed file for training. Then we use the 60 SNPs with all populations to test neuro-admixture

echo "KAN
UTS
YUN
BEN
INJ
INW
QNC
TAI
OKI
" > output/populations/nadmix/pops_2_train.txt

Subset with plink

plink \
--keep-allele-order \
--keep-fam output/populations/nadmix/pops_2_train.txt \
--bfile output/populations/file7 \
--maf 0.1 \
--make-bed \
--out output/populations/nadmix/train/train_r_0.01 \
--extract output/populations/nadmix/indepSNP.prune.in \
--geno 0.2 \
--silent \
--write-snplist
grep "samples\|variants" output/populations/nadmix/train/train_r_0.01.log 
## 82731 variants loaded from .bim file.
## --extract: 21454 variants remaining.
## Total genotyping rate in remaining samples is 0.969198.
## 11 variants removed due to missing genotype data (--geno).
## 512 variants removed due to minor allele threshold(s)
## 20931 variants and 79 people pass filters and QC.

Transfer data to cluster (all the data sets are here) - files for inference

rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/train lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix

We can also transfer the files for training

rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/snps_sets lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix

Train neuroadmix with a range of ks Get two gpu/cpus

salloc --cpus-per-gpu=2 --gpus=2 --time=30:00 --partition gpu_devel

Unsupervised (multi-head)

We run admixture with these populations and the recommended k=4 instead of k=5. It is probably because KAN shows admixture with KAG. Therefore, we can train neuro-admixture with k=4 and see if it will make the same assignments as admixture did.

cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;
module load miniconda/23.1.0;
conda activate nadmenv;

# pckmeans r_0.01
neural-admixture train --seed 1234 --initialization pckmeans --warmup_epochs 1000 --max_epochs 1000 --activation relu --optimizer adam --learning_rate 1e-7 --min_k 2 --max_k 10 --name r_0.01 --data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/train/train_r_0.01.bed --save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.01 | tee r_0.01.log

Download the data

rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.01 /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results

1.1. Plot after training

Clear memory and environment

# Clear entire environment
rm(list = ls())
# Forcefully trigger garbage collection
gc()
##           used (Mb) gc trigger  (Mb) limit (Mb) max used (Mb)
## Ncells  978096 52.3    1979700 105.8         NA  1295087 69.2
## Vcells 1699816 13.0    8388608  64.0      32768  2166040 16.6
# Extract ancestry coefficients
nadmixk5 <- read_delim(
  here("output", "populations", "nadmix", "results", "r_0.01","r_0.01.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(nadmixk5)
## # A tibble: 6 × 5
##        X1      X2    X3       X4      X5
##     <dbl>   <dbl> <dbl>    <dbl>   <dbl>
## 1 0.0250  0.0177  0.914 0.0217   0.0219 
## 2 0.0493  0.0223  0.901 0.000802 0.0264 
## 3 0.0374  0.0207  0.923 0.000681 0.0186 
## 4 0.00242 0.00668 0.975 0.0115   0.00489
## 5 0.00159 0.00471 0.982 0.00858  0.00323
## 6 0.0348  0.0217  0.923 0.000646 0.0197

The fam file

fam_file <- here(
  "output", "populations", "nadmix", "train","train_r_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

nadmixk5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(nadmixk5)

head(nadmixk5)
##    ind pop          X1          X2        X3           X4          X5
## 1 1001 OKI 0.025028508 0.017738435 0.9136265 0.0217335448 0.021873066
## 2 1002 OKI 0.049260873 0.022308363 0.9012038 0.0008023476 0.026424598
## 3 1003 OKI 0.037390847 0.020651398 0.9226472 0.0006807603 0.018629761
## 4 1004 OKI 0.002420123 0.006682593 0.9745043 0.0115027474 0.004890267
## 5 1005 OKI 0.001587119 0.004713184 0.9818867 0.0085788695 0.003234004
## 6 1006 OKI 0.034804370 0.021673145 0.9232054 0.0006456328 0.019671431

Rename the columns

# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(nadmixk5)
##    ind pop          v1          v2        v3           v4          v5
## 1 1001 OKI 0.025028508 0.017738435 0.9136265 0.0217335448 0.021873066
## 2 1002 OKI 0.049260873 0.022308363 0.9012038 0.0008023476 0.026424598
## 3 1003 OKI 0.037390847 0.020651398 0.9226472 0.0006807603 0.018629761
## 4 1004 OKI 0.002420123 0.006682593 0.9745043 0.0115027474 0.004890267
## 5 1005 OKI 0.001587119 0.004713184 0.9818867 0.0085788695 0.003234004
## 6 1006 OKI 0.034804370 0.021673145 0.9232054 0.0006456328 0.019671431

Import sample locations

sampling_loc <- readRDS(here("output", "local_adaptation", "sampling_loc.rds"))
head(sampling_loc)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN
source(
  here(
    "scripts", "analysis", "my_theme3.R"
  )
)

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Melt the data frame for plotting
# Q_melted <- melt(nadmixk5, id.vars = c("ind", "pop"))
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(Region, Region2, Country, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)

color_palette <-
  c(
    "v1" = "#00FF00",
    "v2" = "#FFFF00",
    "v3" = "#FF0000",
    "v4" = "#0000FF",
    "v5" = "#FF00FF"
  )

# Generate all potential variable names
all_variables <- paste0("v", 1:5)

# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
                            color = names(color_palette))

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture training k5.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

# # save it
ggsave(
  here("output", "populations", "figures", "neuro_admixture_k=5_r_0.01_trained.pdf"),
  width  = 6,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

1.2 Inference with all populations

Now we can use all populations but extract the SNPs we trained

plink \
--keep-allele-order \
--bfile output/populations/file7 \
--make-bed \
--export vcf \
--out output/populations/snps_sets/r2_0.01 \
--extract output/populations/nadmix/train/train_r_0.01.snplist \
--silent
grep "samples\|variants" output/populations/snps_sets/r2_0.01.log 
## 82731 variants loaded from .bim file.
## --extract: 21095 variants remaining.
## 21095 variants and 237 people pass filters and QC.

Transfer data to cluster

rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/

Inference mode (projective analysis)

neural-admixture infer --name r_0.01 --save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.01 --out_name r_0.01_inference --data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/r2_0.01.bed | tee r_0.01_inference.log

1.3 Plot after inference with all populations

Download the data

rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.01 /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results

1.4 Plot inference

# Extract ancestry coefficients
nadmixk5 <- read_delim(
  here("output", "populations", "nadmix", "results", "r_0.01","r_0.01_inference.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
##        X1      X2    X3       X4      X5
##     <dbl>   <dbl> <dbl>    <dbl>   <dbl>
## 1 0.0242  0.0176  0.913 0.0229   0.0218 
## 2 0.0466  0.0223  0.903 0.000826 0.0268 
## 3 0.0350  0.0206  0.925 0.000711 0.0188 
## 4 0.00230 0.00665 0.974 0.0120   0.00482
## 5 0.00154 0.00487 0.981 0.00912  0.00319
## 6 0.0317  0.0212  0.927 0.000650 0.0195

The fam file

fam_file <- here(
  "output", "populations", "snps_sets", "r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

nadmixk5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(nadmixk5)

head(nadmixk5)
##    ind pop          X1          X2        X3           X4          X5
## 1 1001 OKI 0.024173928 0.017611718 0.9134678 0.0229028091 0.021843690
## 2 1002 OKI 0.046582617 0.022330839 0.9034133 0.0008259051 0.026847320
## 3 1003 OKI 0.035011258 0.020629292 0.9248377 0.0007113857 0.018810373
## 4 1004 OKI 0.002296278 0.006649634 0.9742270 0.0120037850 0.004823298
## 5 1005 OKI 0.001538968 0.004871406 0.9812847 0.0091172550 0.003187769
## 6 1006 OKI 0.031735279 0.021164291 0.9269638 0.0006500665 0.019486537

Rename the columns

# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(nadmixk5)
##    ind pop          v1          v2        v3           v4          v5
## 1 1001 OKI 0.024173928 0.017611718 0.9134678 0.0229028091 0.021843690
## 2 1002 OKI 0.046582617 0.022330839 0.9034133 0.0008259051 0.026847320
## 3 1003 OKI 0.035011258 0.020629292 0.9248377 0.0007113857 0.018810373
## 4 1004 OKI 0.002296278 0.006649634 0.9742270 0.0120037850 0.004823298
## 5 1005 OKI 0.001538968 0.004871406 0.9812847 0.0091172550 0.003187769
## 6 1006 OKI 0.031735279 0.021164291 0.9269638 0.0006500665 0.019486537

Import sample locations

sampling_loc <- readRDS(here("output", "local_adaptation", "sampling_loc.rds"))
head(sampling_loc)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN
source(
  here(
    "scripts", "analysis", "my_theme3.R"
  )
)

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(Region, Region2, Country, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)

color_palette <-
  c(
    "v1" = "#00FF00",
    "v2" = "#FFFF00",
    "v3" = "#FF0000",
    "v4" = "#0000FF",
    "v5" = "#FF00FF" 
  )

# Generate all potential variable names
all_variables <- paste0("v", 1:5)

# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
                            color = names(color_palette))

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture inference for k5 with 20,931 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "populations", "figures", "neuro_admixture_k=5_r_0.01_inference.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2. Train neuro admixture with SNPS r2 0.1

Subset with plink

plink \
--keep-allele-order \
--keep-fam output/populations/nadmix/pops_2_train.txt \
--bfile output/populations/file7 \
--maf 0.1 \
--make-bed \
--out output/populations/nadmix/train/train_r_0.1 \
--extract output/populations/indepSNP_chr.prune.in \
--geno 0.2 \
--silent \
--write-snplist
# 57k SNPs
grep "variants\|people" output/populations/nadmix/train/train_r_0.1.log
## 82731 variants loaded from .bim file.
## 237 people (67 males, 30 females, 140 ambiguous) loaded from .fam.
## --extract: 60988 variants remaining.
## --keep-fam: 90 people remaining.
## 0 variants removed due to missing genotype data (--geno).
## 2836 variants removed due to minor allele threshold(s)
## 58152 variants and 90 people pass filters and QC.

Train with the pckmeans initialization

Get two gpu/cpus

salloc --cpus-per-gpu=2 --gpus=2 --time=30:00 --partition gpu_devel

Train

cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;
module load miniconda/23.1.0;
conda activate nadmenv;

# pckmeans r_0.1
neural-admixture train --seed 1234 --initialization pckmeans --warmup_epochs 1000 --max_epochs 1000 --activation relu --optimizer adam --learning_rate 1e-7 --min_k 2 --max_k 10 --name r_0.1 --data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/train/train_r_0.1.bed --save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.1 | tee r_0.1.log

Download the data

rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.1 /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results

2.1. Plot after training

# Extract ancestry coefficients
nadmixk5 <- read_delim(
  here("output", "populations", "nadmix", "results", "r_0.1","r_0.1.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 


head(nadmixk5)
## # A tibble: 6 × 5
##          X1      X2       X3    X4        X5
##       <dbl>   <dbl>    <dbl> <dbl>     <dbl>
## 1 0.0325    0.0140  0.00322  0.939 0.0118   
## 2 0.000184  0.0173  0.000650 0.978 0.00368  
## 3 0.0000777 0.00645 0.000763 0.990 0.00281  
## 4 0.00225   0.00527 0.000948 0.991 0.0000867
## 5 0.00152   0.00364 0.000695 0.994 0.0000509
## 6 0.000231  0.00756 0.00138  0.989 0.00146

The fam file

fam_file <- here(
  "output", "populations", "nadmix", "train","train_r_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

nadmixk5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(nadmixk5)

head(nadmixk5)
##    ind pop           X1          X2           X3        X4           X5
## 1 1001 OKI 0.0324859358 0.013972007 0.0032239752 0.9385149 1.180321e-02
## 2 1002 OKI 0.0001844297 0.017312648 0.0006498906 0.9781768 3.676144e-03
## 3 1003 OKI 0.0000777165 0.006451267 0.0007630020 0.9898998 2.808214e-03
## 4 1004 OKI 0.0022484392 0.005267162 0.0009479558 0.9914498 8.670188e-05
## 5 1005 OKI 0.0015169261 0.003644208 0.0006946810 0.9940934 5.088000e-05
## 6 1006 OKI 0.0002308622 0.007557554 0.0013819213 0.9893672 1.462476e-03

Rename the columns

# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(nadmixk5)
##    ind pop           v1          v2           v3        v4           v5
## 1 1001 OKI 0.0324859358 0.013972007 0.0032239752 0.9385149 1.180321e-02
## 2 1002 OKI 0.0001844297 0.017312648 0.0006498906 0.9781768 3.676144e-03
## 3 1003 OKI 0.0000777165 0.006451267 0.0007630020 0.9898998 2.808214e-03
## 4 1004 OKI 0.0022484392 0.005267162 0.0009479558 0.9914498 8.670188e-05
## 5 1005 OKI 0.0015169261 0.003644208 0.0006946810 0.9940934 5.088000e-05
## 6 1006 OKI 0.0002308622 0.007557554 0.0013819213 0.9893672 1.462476e-03

Import sample locations

sampling_loc <- readRDS(here("output", "local_adaptation", "sampling_loc.rds"))
head(sampling_loc)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN
source(
  here(
    "scripts", "analysis", "my_theme3.R"
  )
)

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Melt the data frame for plotting
# Q_melted <- melt(nadmixk5, id.vars = c("ind", "pop"))
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(Region, Region2, Country, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)

color_palette <-
  c(
    "v1" = "#00FF00",
    "v2" = "#0000FF",
    "v3" = "#FFFF00",
    "v4" = "#FF0000",
    "v5" = "#FF00FF" 
  )

# Generate all potential variable names
all_variables <- paste0("v", 1:5)

# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
                            color = names(color_palette))

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture training k5.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

# # # save it
ggsave(
  here("output", "populations", "figures", "neuro_admixture_k=5_r_0.1_trained.pdf"),
  width  = 6,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.2 Inference with all populations

Now we can use all populations but extract the SNPs we trained

plink \
--keep-allele-order \
--bfile output/populations/file7 \
--make-bed \
--export vcf \
--out output/populations/snps_sets/r2_0.1 \
--extract output/populations/nadmix/train/train_r_0.1.snplist \
--silent
grep "samples\|variants" output/populations/snps_sets/r2_0.1.log 
## 82731 variants loaded from .bim file.
## --extract: 58152 variants remaining.
## 58152 variants and 237 people pass filters and QC.

Transfer data to cluster

rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/

Inference mode (projective analysis)

neural-admixture infer --name r_0.1 --save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.1 --out_name r_0.1_inference --data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/r2_0.1.bed | tee r_0.1_inference.log

2.3 Plot after inference with all populations

Download the data

rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.1 /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results

2.4 Plot inference

# Extract ancestry coefficients
nadmixk5 <- read_delim(
  here("output", "populations", "nadmix", "results", "r_0.1","r_0.1_inference.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
##          X1      X2       X3    X4        X5
##       <dbl>   <dbl>    <dbl> <dbl>     <dbl>
## 1 0.0335    0.0141  0.00335  0.937 0.0118   
## 2 0.000194  0.0182  0.000687 0.977 0.00374  
## 3 0.0000783 0.00649 0.000800 0.990 0.00281  
## 4 0.00245   0.00539 0.00102  0.991 0.0000917
## 5 0.00160   0.00362 0.000710 0.994 0.0000509
## 6 0.000235  0.00773 0.00145  0.989 0.00148

The fam file

fam_file <- here(
  "output", "populations", "snps_sets", "r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

nadmixk5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(nadmixk5)

head(nadmixk5)
##    ind pop           X1          X2           X3        X4           X5
## 1 1001 OKI 3.353009e-02 0.014147708 0.0033505654 0.9372090 1.176264e-02
## 2 1002 OKI 1.938650e-04 0.018223537 0.0006867652 0.9771599 3.735926e-03
## 3 1003 OKI 7.834005e-05 0.006492031 0.0007999630 0.9898221 2.807595e-03
## 4 1004 OKI 2.448580e-03 0.005393364 0.0010208730 0.9910454 9.167830e-05
## 5 1005 OKI 1.595465e-03 0.003623286 0.0007097295 0.9940206 5.091701e-05
## 6 1006 OKI 2.346695e-04 0.007725136 0.0014463243 0.9891122 1.481695e-03

Rename the columns

# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(nadmixk5)
##    ind pop           v1          v2           v3        v4           v5
## 1 1001 OKI 3.353009e-02 0.014147708 0.0033505654 0.9372090 1.176264e-02
## 2 1002 OKI 1.938650e-04 0.018223537 0.0006867652 0.9771599 3.735926e-03
## 3 1003 OKI 7.834005e-05 0.006492031 0.0007999630 0.9898221 2.807595e-03
## 4 1004 OKI 2.448580e-03 0.005393364 0.0010208730 0.9910454 9.167830e-05
## 5 1005 OKI 1.595465e-03 0.003623286 0.0007097295 0.9940206 5.091701e-05
## 6 1006 OKI 2.346695e-04 0.007725136 0.0014463243 0.9891122 1.481695e-03

Import sample locations

sampling_loc <- readRDS(here("output", "local_adaptation", "sampling_loc.rds"))
head(sampling_loc)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN
source(
  here(
    "scripts", "analysis", "my_theme3.R"
  )
)

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Melt the data frame for plotting
# Q_melted <- melt(nadmixk5, id.vars = c("ind", "pop"))
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(Region, Region2, Country, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)

color_palette <-
  c(
    "v1" = "#00FF00",
    "v2" = "#0000FF",
    "v3" = "#FFFF00", 
    "v4" = "#FF0000",
    "v5" = "#FF00FF"  
  )

# Generate all potential variable names
all_variables <- paste0("v", 1:5)

# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
                            color = names(color_palette))

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "populations", "figures", "neuro_admixture_k=5_r_0.1_inference.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3. Train with neutral SNPs

plink2 \
--allow-extra-chr \
--bfile output/populations/neutral \
--make-bed \
--maf 0.1 \
--geno 0.2 \
--extract output/populations/neutral_SNPs.txt \
--out output/populations/nadmix/train/neutral_train \
--keep-fam output/populations/nadmix/pops_2_train.txt \
--write-snplist \
--silent;
grep "samples\|variants" output/populations/nadmix/train/neutral_train.log 
## 237 samples (30 females, 67 males, 140 ambiguous; 237 founders) loaded from
## 9483 variants loaded from output/populations/neutral.bim.
## --extract: 9483 variants remaining.
## --keep-fam: 90 samples remaining.
## 90 samples (19 females, 44 males, 27 ambiguous; 90 founders) remaining after
## --geno: 0 variants removed due to missing genotype data.
## 370 variants removed due to allele frequency threshold(s)
## 9113 variants remaining after main filters.

Now use the snp list to create the same data set with all the samples. Neural admixture expects the same SNPs for training and inference. Because we subseted the data with filter –maf 0.1, we ended up with less SNPs. Then, we can create the full data set with the SNPs that we have after sub-setting.

plink2 \
--allow-extra-chr \
--bfile output/populations/neutral \
--export vcf \
--make-bed \
--maf 0.1 \
--geno 0.2 \
--out output/populations/snps_sets/neutral \
--extract output/populations/nadmix/train/neutral_train.snplist \
--silent;
grep "samples\|variants" output/populations/snps_sets/neutral.log 
## 237 samples (30 females, 67 males, 140 ambiguous; 237 founders) loaded from
## 9483 variants loaded from output/populations/neutral.bim.
## --extract: 9113 variants remaining.
## --geno: 0 variants removed due to missing genotype data.
## 0 variants removed due to allele frequency threshold(s)
## 9113 variants remaining after main filters.

Train with the pckmeans initialization Get two gpu/cpus

salloc --cpus-per-gpu=2 --gpus=2 --time=30:00 --partition gpu_devel

Train

cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;
module load miniconda/23.1.0;
conda activate nadmenv;

# pckmeans r_0.1
neural-admixture train --seed 1234 --initialization pckmeans --warmup_epochs 1000 --max_epochs 1000 --activation relu --optimizer adam --learning_rate 1e-7 --min_k 2 --max_k 10 --name neutral --data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/train/neutral_train.bed --save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/neutral | tee neutral.log

Download the data

rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/neutral /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results

3.1. Plot after training

# Extract ancestry coefficients
nadmixk5 <- read_delim(
  here("output", "populations", "nadmix", "results", "neutral","neutral.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
##      X1      X2     X3       X4     X5
##   <dbl>   <dbl>  <dbl>    <dbl>  <dbl>
## 1 0.123 0.0366  0.0409 0.00116  0.798 
## 2 0.145 0.00531 0.800  0.0266   0.0222
## 3 0.123 0.0121  0.790  0.0244   0.0508
## 4 0.356 0.0661  0.0194 0.000302 0.558 
## 5 0.314 0.0465  0.0105 0.000110 0.629 
## 6 0.130 0.0100  0.794  0.0272   0.0390

The fam file

fam_file <- here(
  "output", "populations", "nadmix", "train","neutral_train.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

nadmixk5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(nadmixk5)

head(nadmixk5)
##    ind pop        X1          X2         X3           X4         X5
## 1 1001 OKI 0.1231617 0.036582019 0.04086823 0.0011551608 0.79823291
## 2 1002 OKI 0.1453723 0.005305157 0.80046910 0.0266097598 0.02224370
## 3 1003 OKI 0.1228256 0.012105865 0.78983307 0.0244478285 0.05078756
## 4 1004 OKI 0.3562233 0.066148236 0.01942665 0.0003020810 0.55789977
## 5 1005 OKI 0.3138226 0.046455488 0.01050009 0.0001099689 0.62911183
## 6 1006 OKI 0.1302217 0.010048003 0.79356050 0.0271946248 0.03897528

Rename the columns

# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(nadmixk5)
##    ind pop        v1          v2         v3           v4         v5
## 1 1001 OKI 0.1231617 0.036582019 0.04086823 0.0011551608 0.79823291
## 2 1002 OKI 0.1453723 0.005305157 0.80046910 0.0266097598 0.02224370
## 3 1003 OKI 0.1228256 0.012105865 0.78983307 0.0244478285 0.05078756
## 4 1004 OKI 0.3562233 0.066148236 0.01942665 0.0003020810 0.55789977
## 5 1005 OKI 0.3138226 0.046455488 0.01050009 0.0001099689 0.62911183
## 6 1006 OKI 0.1302217 0.010048003 0.79356050 0.0271946248 0.03897528

Import sample locations

sampling_loc <- readRDS(here("output", "local_adaptation", "sampling_loc.rds"))
head(sampling_loc)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN
source(
  here(
    "scripts", "analysis", "my_theme3.R"
  )
)

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(Region, Region2, Country, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)

color_palette <-
  c(
    "v1" = "#00FF00",
    "v2" = "#FF00FF",
    "v3" = "#FFFF00", 
    "v4" = "#0000FF",
    "v5" = "#FF0000"  
  )


# Generate all potential variable names
all_variables <- paste0("v", 1:5)

# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
                            color = names(color_palette))

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture training k5.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

# # # save it
ggsave(
  here("output", "populations", "figures", "neuro_admixture_k=5_neutral_trained.pdf"),
  width  = 6,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.2 Inference with all populations

Transfer data to cluster

rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/

Inference mode (projective analysis)

neural-admixture infer --name neutral --save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/neutral --out_name neutral_inference --data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/neutral.bed | tee neutral_inference.log

3.3 Plot after inference with all populations

Download the data

rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/neutral /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results

3.4 Plot inference

# Extract ancestry coefficients
nadmixk5 <- read_delim(
  here("output", "populations", "nadmix", "results", "neutral","neutral_inference.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
##      X1      X2     X3       X4     X5
##   <dbl>   <dbl>  <dbl>    <dbl>  <dbl>
## 1 0.117 0.0366  0.0426 0.00120  0.803 
## 2 0.146 0.00545 0.796  0.0295   0.0227
## 3 0.112 0.0123  0.799  0.0267   0.0502
## 4 0.352 0.0681  0.0194 0.000319 0.560 
## 5 0.314 0.0490  0.0105 0.000114 0.626 
## 6 0.120 0.0100  0.803  0.0288   0.0382

The fam file

fam_file <- here(
  "output", "populations", "snps_sets", "neutral.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

nadmixk5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(nadmixk5)

head(nadmixk5)
##    ind pop        X1          X2         X3           X4         X5
## 1 1001 OKI 0.1168938 0.036599070 0.04261101 0.0012043122 0.80269182
## 2 1002 OKI 0.1461714 0.005454067 0.79623282 0.0294865184 0.02265519
## 3 1003 OKI 0.1121285 0.012327265 0.79862928 0.0266750623 0.05023989
## 4 1004 OKI 0.3522965 0.068144552 0.01943193 0.0003188957 0.55980808
## 5 1005 OKI 0.3143196 0.048997048 0.01052473 0.0001135770 0.62604505
## 6 1006 OKI 0.1203695 0.009999427 0.80266857 0.0287723672 0.03819018

Rename the columns

# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(nadmixk5)
##    ind pop        v1          v2         v3           v4         v5
## 1 1001 OKI 0.1168938 0.036599070 0.04261101 0.0012043122 0.80269182
## 2 1002 OKI 0.1461714 0.005454067 0.79623282 0.0294865184 0.02265519
## 3 1003 OKI 0.1121285 0.012327265 0.79862928 0.0266750623 0.05023989
## 4 1004 OKI 0.3522965 0.068144552 0.01943193 0.0003188957 0.55980808
## 5 1005 OKI 0.3143196 0.048997048 0.01052473 0.0001135770 0.62604505
## 6 1006 OKI 0.1203695 0.009999427 0.80266857 0.0287723672 0.03819018

Import sample locations

sampling_loc <- readRDS(here("output", "local_adaptation", "sampling_loc.rds"))
head(sampling_loc)
## # A tibble: 6 × 6
##   Pop_City   Country  Latitude Longitude Region Abbreviation
##   <chr>      <chr>       <dbl>     <dbl> <chr>  <chr>       
## 1 Gelephu    Bhutan       26.9      90.5 Asia   GEL         
## 2 Phnom Penh Cambodia     11.6     105.  Asia   CAM         
## 3 Hainan     China        19.2     110.  Asia   HAI         
## 4 Yunnan     China        24.5     101.  Asia   YUN         
## 5 Hunan      China        27.6     112.  Asia   HUN         
## 6 Bengaluru  India        13.0      77.6 Asia   BEN
source(
  here(
    "scripts", "analysis", "my_theme3.R"
  )
)

# Create a named vector to map countries to regions
country_to_region <- c(
  "Bhutan" = "South Asia",
  "Cambodia" = "Southeast Asia",
  "China" = "East Asia",
  "India" = "South Asia",
  "Indonesia" = "Southeast Asia",
  "Japan" = "East Asia",
  "Malaysia" = "Southeast Asia",
  "Maldives" = "South Asia",
  "Nepal" = "South Asia",
  "Sri Lanka" = "South Asia",
  "Taiwan" = "East Asia",
  "Thailand" = "Southeast Asia",
  "Vietnam" = "Southeast Asia"
)

# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]

# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(Region, Region2, Country, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)

color_palette <-
  c(
    "v1" = "#00FF00",
    "v2" = "#FF00FF",
    "v3" = "#FFFF00", 
    "v4" = "#0000FF",
    "v5" = "#FF0000"  
  )

# Generate all potential variable names
all_variables <- paste0("v", 1:5)

# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
                            color = names(color_palette))

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture inference for k5 with intergenic SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "populations", "figures", "neuro_admixture_k=5_neutral_inference.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)