The software is available at https://github.com/ai-sandbox/neural-admixture
On the HPC we first need to create an interactive session
# Navigate to the neuroadmix working directory
cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;
# Create an interactive session with 1 task and 4 CPUs
salloc --partition devel --time=06:00:00 --nodes=1 --ntasks=1 --cpus-per-task=4 --mem-per-cpu=5120 zsh;
# Load miniconda (do not use the default module; it did not work for me)
module load miniconda/23.1.0
# Create a conda environment
conda create --name nadmenv python=3.9
# environment location: /gpfs/gibbs/project/powell/lvc26/conda_envs/nadmenv
# Activate the new env
conda activate nadmenv
# Install neural-admixture
pip install neural-admixture --user
# To deactivate it later
conda deactivate
# If you want to remove it
# conda env remove -n nadmenv
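# To reuse the environment in a later session, re-initialize conda and activate it: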
source /vast/palmer/apps/avx2/software/miniconda/23.1.0/etc/profile.d/conda.sh;
module load miniconda/23.1.0;
conda activate nadmenv;
export PYTHONPATH=/home/lvc26/project/conda_envs/nadmenv
On the Mac laptop
Libraries
Prune the data
plink2 \
--allow-extra-chr \
--bfile output/populations/file7 \
--indep-pairwise 5 1 0.01 \
--out output/populations/nadmix/indepSNP \
--silent;
grep 'pairwise\|variants\|samples' output/populations/nadmix/indepSNP.log
## --indep-pairwise 5 1 0.01
## 237 samples (30 females, 67 males, 140 ambiguous; 237 founders) loaded from
## 82731 variants loaded from output/populations/file7.bim.
## --indep-pairwise (3 compute threads): 61277/82731 variants removed.
Look at the ADMIXTURE plot with k=5 and choose the populations whose individuals show a single ancestry or low admixture (more than 50% assignment to a single ancestry).
color_palette <-
c(
"v1" = "#00FF00", # KAN UTS
"v2" = "#FFFF00", # YUN BEN
"v3" = "#0000FF", # INJ INW
"v4" = "#FF00FF", # QNC
"v5" = "#FF0000" # TAI OKI
)
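As a hedged sketch of that selection (q_k5 is a hypothetical data frame holding the k=5 Q matrix already joined to the fam populations, as in the blocks further down), one way to pick the training populations programmatically is to keep those whose individuals are, on average, assigned more than 50% to a single cluster:
# Sketch only: q_k5 has columns ind, pop, v1..v5 (ancestry proportions for k=5)
library(dplyr)
train_pops <- q_k5 |>
  rowwise() |>
  mutate(max_anc = max(c_across(v1:v5))) |>  # largest ancestry proportion per individual
  ungroup() |>
  group_by(pop) |>
  summarise(mean_max_anc = mean(max_anc), .groups = "drop") |>
  filter(mean_max_anc > 0.5) |>              # keep populations with low admixture
  pull(pop)
# One family ID per line, as expected by plink --keep-fam
writeLines(train_pops, "output/populations/nadmix/pops_2_train.txt")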
Create a new BED file for training. Then we use the training SNPs with all populations to test Neural Admixture
Subset with plink
plink \
--keep-allele-order \
--keep-fam output/populations/nadmix/pops_2_train.txt \
--bfile output/populations/file7 \
--maf 0.1 \
--make-bed \
--out output/populations/nadmix/train/train_r_0.01 \
--extract output/populations/nadmix/indepSNP.prune.in \
--geno 0.2 \
--silent \
--write-snplist
grep "samples\|variants" output/populations/nadmix/train/train_r_0.01.log
## 82731 variants loaded from .bim file.
## --extract: 21454 variants remaining.
## Total genotyping rate in remaining samples is 0.969198.
## 11 variants removed due to missing genotype data (--geno).
## 512 variants removed due to minor allele threshold(s)
## 20931 variants and 79 people pass filters and QC.
Transfer data to the cluster (all the data sets are here): the training files
rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/train lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix
We can also transfer the full SNP sets used for inference
rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/snps_sets lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix
Train Neural Admixture with a range of k values. Get two GPUs/CPUs.
Unsupervised (multi-head)
We ran ADMIXTURE with these populations and the recommended k=4 instead of k=5, probably because KAN shows admixture with KAG. Therefore, we can train Neural Admixture with k=4 and see whether it makes the same assignments as ADMIXTURE did.
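The GPU allocation itself is not shown; a hedged Slurm request along these lines should work (the partition name and limits are assumptions and depend on the cluster):
# Assumed GPU partition; adjust partition, time, and memory to your cluster
salloc --partition gpu --time=06:00:00 --nodes=1 --ntasks=1 --cpus-per-task=2 --gres=gpu:2 --mem-per-cpu=5120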
cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;
module load miniconda/23.1.0;
conda activate nadmenv;
# pckmeans r_0.01
neural-admixture train \
--seed 1234 \
--initialization pckmeans \
--warmup_epochs 1000 \
--max_epochs 1000 \
--activation relu \
--optimizer adam \
--learning_rate 1e-7 \
--min_k 2 \
--max_k 10 \
--name r_0.01 \
--data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/train/train_r_0.01.bed \
--save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.01 | tee r_0.01.log
Download the data
rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.01 /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results
Clear memory and environment
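The call that produced this output is not shown; a minimal equivalent is:
# Remove all objects from the workspace and run garbage collection
rm(list = ls())
gc()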
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 978096 52.3 1979700 105.8 NA 1295087 69.2
## Vcells 1699816 13.0 8388608 64.0 32768 2166040 16.6
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "r_0.01","r_0.01.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0250 0.0177 0.914 0.0217 0.0219
## 2 0.0493 0.0223 0.901 0.000802 0.0264
## 3 0.0374 0.0207 0.923 0.000681 0.0186
## 4 0.00242 0.00668 0.975 0.0115 0.00489
## 5 0.00159 0.00471 0.982 0.00858 0.00323
## 6 0.0348 0.0217 0.923 0.000646 0.0197
The fam file
fam_file <- here(
"output", "populations", "nadmix", "train","train_r_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
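The merge itself is not shown; a minimal sketch that reproduces the layout below (assuming the .Q rows are in the same order as the .fam file, which is how Neural Admixture writes them) is:
# Bind sample IDs and populations to the ancestry coefficients (row order must match)
nadmixk5 <- cbind(fam_data, nadmixk5)
head(nadmixk5)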
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.025028508 0.017738435 0.9136265 0.0217335448 0.021873066
## 2 1002 OKI 0.049260873 0.022308363 0.9012038 0.0008023476 0.026424598
## 3 1003 OKI 0.037390847 0.020651398 0.9226472 0.0006807603 0.018629761
## 4 1004 OKI 0.002420123 0.006682593 0.9745043 0.0115027474 0.004890267
## 5 1005 OKI 0.001587119 0.004713184 0.9818867 0.0085788695 0.003234004
## 6 1006 OKI 0.034804370 0.021673145 0.9232054 0.0006456328 0.019671431
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.025028508 0.017738435 0.9136265 0.0217335448 0.021873066
## 2 1002 OKI 0.049260873 0.022308363 0.9012038 0.0008023476 0.026424598
## 3 1003 OKI 0.037390847 0.020651398 0.9226472 0.0006807603 0.018629761
## 4 1004 OKI 0.002420123 0.006682593 0.9745043 0.0115027474 0.004890267
## 5 1005 OKI 0.001587119 0.004713184 0.9818867 0.0085788695 0.003234004
## 6 1006 OKI 0.034804370 0.021673145 0.9232054 0.0006456328 0.019671431
Import sample locations
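The import is not shown; a hedged sketch (the file path below is a placeholder, not the project's actual file):
# Placeholder path; the table has columns Pop_City, Country, Latitude, Longitude, Region, Abbreviation
sampling_loc <- read_delim(
  here("data", "sampling_locations.csv"),
  delim = ",",
  show_col_types = FALSE
)
head(sampling_loc)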
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
source(
here(
"scripts", "analysis", "my_theme3.R"
)
)
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Melt the data frame for plotting
# Q_melted <- melt(nadmixk5, id.vars = c("ind", "pop"))
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#FFFF00",
"v3" = "#FF0000",
"v4" = "#0000FF",
"v5" = "#FF00FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
color = names(color_palette))
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture training k5.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Now we can use all populations, but extract only the SNPs we used for training
plink \
--keep-allele-order \
--bfile output/populations/file7 \
--make-bed \
--export vcf \
--out output/populations/snps_sets/r2_0.01 \
--extract output/populations/nadmix/train/train_r_0.01.snplist \
--silent
grep "samples\|variants" output/populations/snps_sets/r2_0.01.log
## 82731 variants loaded from .bim file.
## --extract: 21095 variants remaining.
## 21095 variants and 237 people pass filters and QC.
Transfer data to cluster
rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/
Inference mode (projective analysis)
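The projective run happens on the cluster before downloading, and the exact command is not shown here. A hedged sketch using neural-admixture's inference mode (the flags follow the project README but may differ between versions, so check neural-admixture infer --help; the --data_path is a placeholder for wherever the full r2_0.01 set was transferred):
cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;
module load miniconda/23.1.0;
conda activate nadmenv;
# Sketch only: project all 237 samples onto the networks trained above.
# --name must match the training run so the weights saved in --save_dir are found.
neural-admixture infer \
--name r_0.01 \
--save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.01 \
--out_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.01 \
--data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/snps_sets/r2_0.01.bed | tee r_0.01_inference.log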
Download the data
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "r_0.01","r_0.01_inference.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0242 0.0176 0.913 0.0229 0.0218
## 2 0.0466 0.0223 0.903 0.000826 0.0268
## 3 0.0350 0.0206 0.925 0.000711 0.0188
## 4 0.00230 0.00665 0.974 0.0120 0.00482
## 5 0.00154 0.00487 0.981 0.00912 0.00319
## 6 0.0317 0.0212 0.927 0.000650 0.0195
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.024173928 0.017611718 0.9134678 0.0229028091 0.021843690
## 2 1002 OKI 0.046582617 0.022330839 0.9034133 0.0008259051 0.026847320
## 3 1003 OKI 0.035011258 0.020629292 0.9248377 0.0007113857 0.018810373
## 4 1004 OKI 0.002296278 0.006649634 0.9742270 0.0120037850 0.004823298
## 5 1005 OKI 0.001538968 0.004871406 0.9812847 0.0091172550 0.003187769
## 6 1006 OKI 0.031735279 0.021164291 0.9269638 0.0006500665 0.019486537
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.024173928 0.017611718 0.9134678 0.0229028091 0.021843690
## 2 1002 OKI 0.046582617 0.022330839 0.9034133 0.0008259051 0.026847320
## 3 1003 OKI 0.035011258 0.020629292 0.9248377 0.0007113857 0.018810373
## 4 1004 OKI 0.002296278 0.006649634 0.9742270 0.0120037850 0.004823298
## 5 1005 OKI 0.001538968 0.004871406 0.9812847 0.0091172550 0.003187769
## 6 1006 OKI 0.031735279 0.021164291 0.9269638 0.0006500665 0.019486537
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
source(
here(
"scripts", "analysis", "my_theme3.R"
)
)
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#FFFF00",
"v3" = "#FF0000",
"v4" = "#0000FF",
"v5" = "#FF00FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
color = names(color_palette))
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture inference for k5 with 20,931 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Subset with plink
plink \
--keep-allele-order \
--keep-fam output/populations/nadmix/pops_2_train.txt \
--bfile output/populations/file7 \
--maf 0.1 \
--make-bed \
--out output/populations/nadmix/train/train_r_0.1 \
--extract output/populations/indepSNP_chr.prune.in \
--geno 0.2 \
--silent \
--write-snplist
# ~58k SNPs remain after filtering
grep "variants\|people" output/populations/nadmix/train/train_r_0.1.log
## 82731 variants loaded from .bim file.
## 237 people (67 males, 30 females, 140 ambiguous) loaded from .fam.
## --extract: 60988 variants remaining.
## --keep-fam: 90 people remaining.
## 0 variants removed due to missing genotype data (--geno).
## 2836 variants removed due to minor allele threshold(s)
## 58152 variants and 90 people pass filters and QC.
Train with the pckmeans initialization
Get two GPUs/CPUs
Train
cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;
module load miniconda/23.1.0;
conda activate nadmenv;
# pckmeans r_0.1
neural-admixture train \
--seed 1234 \
--initialization pckmeans \
--warmup_epochs 1000 \
--max_epochs 1000 \
--activation relu \
--optimizer adam \
--learning_rate 1e-7 \
--min_k 2 \
--max_k 10 \
--name r_0.1 \
--data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/train/train_r_0.1.bed \
--save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.1 | tee r_0.1.log
Download the data
rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/r_0.1 /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "r_0.1","r_0.1.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0325 0.0140 0.00322 0.939 0.0118
## 2 0.000184 0.0173 0.000650 0.978 0.00368
## 3 0.0000777 0.00645 0.000763 0.990 0.00281
## 4 0.00225 0.00527 0.000948 0.991 0.0000867
## 5 0.00152 0.00364 0.000695 0.994 0.0000509
## 6 0.000231 0.00756 0.00138 0.989 0.00146
The fam file
fam_file <- here(
"output", "populations", "nadmix", "train","train_r_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.0324859358 0.013972007 0.0032239752 0.9385149 1.180321e-02
## 2 1002 OKI 0.0001844297 0.017312648 0.0006498906 0.9781768 3.676144e-03
## 3 1003 OKI 0.0000777165 0.006451267 0.0007630020 0.9898998 2.808214e-03
## 4 1004 OKI 0.0022484392 0.005267162 0.0009479558 0.9914498 8.670188e-05
## 5 1005 OKI 0.0015169261 0.003644208 0.0006946810 0.9940934 5.088000e-05
## 6 1006 OKI 0.0002308622 0.007557554 0.0013819213 0.9893672 1.462476e-03
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.0324859358 0.013972007 0.0032239752 0.9385149 1.180321e-02
## 2 1002 OKI 0.0001844297 0.017312648 0.0006498906 0.9781768 3.676144e-03
## 3 1003 OKI 0.0000777165 0.006451267 0.0007630020 0.9898998 2.808214e-03
## 4 1004 OKI 0.0022484392 0.005267162 0.0009479558 0.9914498 8.670188e-05
## 5 1005 OKI 0.0015169261 0.003644208 0.0006946810 0.9940934 5.088000e-05
## 6 1006 OKI 0.0002308622 0.007557554 0.0013819213 0.9893672 1.462476e-03
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
source(
here(
"scripts", "analysis", "my_theme3.R"
)
)
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Melt the data frame for plotting
# Q_melted <- melt(nadmixk5, id.vars = c("ind", "pop"))
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#0000FF",
"v3" = "#FFFF00",
"v4" = "#FF0000",
"v5" = "#FF00FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
color = names(color_palette))
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture training k5.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Now we can use all populations, but extract only the SNPs we used for training
plink \
--keep-allele-order \
--bfile output/populations/file7 \
--make-bed \
--export vcf \
--out output/populations/snps_sets/r2_0.1 \
--extract output/populations/nadmix/train/train_r_0.1.snplist \
--silent
grep "samples\|variants" output/populations/snps_sets/r2_0.1.log
## 82731 variants loaded from .bim file.
## --extract: 58152 variants remaining.
## 58152 variants and 237 people pass filters and QC.
Transfer data to cluster
rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/
Inference mode (projective analysis)
Download the data
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "r_0.1","r_0.1_inference.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0335 0.0141 0.00335 0.937 0.0118
## 2 0.000194 0.0182 0.000687 0.977 0.00374
## 3 0.0000783 0.00649 0.000800 0.990 0.00281
## 4 0.00245 0.00539 0.00102 0.991 0.0000917
## 5 0.00160 0.00362 0.000710 0.994 0.0000509
## 6 0.000235 0.00773 0.00145 0.989 0.00148
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 3.353009e-02 0.014147708 0.0033505654 0.9372090 1.176264e-02
## 2 1002 OKI 1.938650e-04 0.018223537 0.0006867652 0.9771599 3.735926e-03
## 3 1003 OKI 7.834005e-05 0.006492031 0.0007999630 0.9898221 2.807595e-03
## 4 1004 OKI 2.448580e-03 0.005393364 0.0010208730 0.9910454 9.167830e-05
## 5 1005 OKI 1.595465e-03 0.003623286 0.0007097295 0.9940206 5.091701e-05
## 6 1006 OKI 2.346695e-04 0.007725136 0.0014463243 0.9891122 1.481695e-03
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 3.353009e-02 0.014147708 0.0033505654 0.9372090 1.176264e-02
## 2 1002 OKI 1.938650e-04 0.018223537 0.0006867652 0.9771599 3.735926e-03
## 3 1003 OKI 7.834005e-05 0.006492031 0.0007999630 0.9898221 2.807595e-03
## 4 1004 OKI 2.448580e-03 0.005393364 0.0010208730 0.9910454 9.167830e-05
## 5 1005 OKI 1.595465e-03 0.003623286 0.0007097295 0.9940206 5.091701e-05
## 6 1006 OKI 2.346695e-04 0.007725136 0.0014463243 0.9891122 1.481695e-03
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
source(
here(
"scripts", "analysis", "my_theme3.R"
)
)
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Melt the data frame for plotting
# Q_melted <- melt(nadmixk5, id.vars = c("ind", "pop"))
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#0000FF",
"v3" = "#FFFF00",
"v4" = "#FF0000",
"v5" = "#FF00FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
color = names(color_palette))
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
plink2 \
--allow-extra-chr \
--bfile output/populations/neutral \
--make-bed \
--maf 0.1 \
--geno 0.2 \
--extract output/populations/neutral_SNPs.txt \
--out output/populations/nadmix/train/neutral_train \
--keep-fam output/populations/nadmix/pops_2_train.txt \
--write-snplist \
--silent;
grep "samples\|variants" output/populations/nadmix/train/neutral_train.log
## 237 samples (30 females, 67 males, 140 ambiguous; 237 founders) loaded from
## 9483 variants loaded from output/populations/neutral.bim.
## --extract: 9483 variants remaining.
## --keep-fam: 90 samples remaining.
## 90 samples (19 females, 44 males, 27 ambiguous; 90 founders) remaining after
## --geno: 0 variants removed due to missing genotype data.
## 370 variants removed due to allele frequency threshold(s)
## 9113 variants remaining after main filters.
Now use the SNP list to create the same data set with all the samples. Neural Admixture expects the same SNPs for training and inference. Because we subsetted the data with --maf 0.1, we ended up with fewer SNPs. Therefore, we create the full data set with the SNPs remaining after subsetting.
plink2 \
--allow-extra-chr \
--bfile output/populations/neutral \
--export vcf \
--make-bed \
--maf 0.1 \
--geno 0.2 \
--out output/populations/snps_sets/neutral \
--extract output/populations/nadmix/train/neutral_train.snplist \
--silent;
grep "samples\|variants" output/populations/snps_sets/neutral.log
## 237 samples (30 females, 67 males, 140 ambiguous; 237 founders) loaded from
## 9483 variants loaded from output/populations/neutral.bim.
## --extract: 9113 variants remaining.
## --geno: 0 variants removed due to missing genotype data.
## 0 variants removed due to allele frequency threshold(s)
## 9113 variants remaining after main filters.
Train with the pckmeans initialization. Get two GPUs/CPUs.
Train
cd /gpfs/ycga/project/caccone/lvc26/neuroadmix;
module load miniconda/23.1.0;
conda activate nadmenv;
# pckmeans neutral
neural-admixture train \
--seed 1234 \
--initialization pckmeans \
--warmup_epochs 1000 \
--max_epochs 1000 \
--activation relu \
--optimizer adam \
--learning_rate 1e-7 \
--min_k 2 \
--max_k 10 \
--name neutral \
--data_path /gpfs/ycga/project/caccone/lvc26/neuroadmix/train/neutral_train.bed \
--save_dir /gpfs/ycga/project/caccone/lvc26/neuroadmix/neutral | tee neutral.log
Download the data
rsync -chavzP --stats lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/neuroadmix/neutral /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix/results
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "neutral","neutral.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.123 0.0366 0.0409 0.00116 0.798
## 2 0.145 0.00531 0.800 0.0266 0.0222
## 3 0.123 0.0121 0.790 0.0244 0.0508
## 4 0.356 0.0661 0.0194 0.000302 0.558
## 5 0.314 0.0465 0.0105 0.000110 0.629
## 6 0.130 0.0100 0.794 0.0272 0.0390
The fam file
fam_file <- here(
"output", "populations", "nadmix", "train","neutral_train.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.1231617 0.036582019 0.04086823 0.0011551608 0.79823291
## 2 1002 OKI 0.1453723 0.005305157 0.80046910 0.0266097598 0.02224370
## 3 1003 OKI 0.1228256 0.012105865 0.78983307 0.0244478285 0.05078756
## 4 1004 OKI 0.3562233 0.066148236 0.01942665 0.0003020810 0.55789977
## 5 1005 OKI 0.3138226 0.046455488 0.01050009 0.0001099689 0.62911183
## 6 1006 OKI 0.1302217 0.010048003 0.79356050 0.0271946248 0.03897528
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.1231617 0.036582019 0.04086823 0.0011551608 0.79823291
## 2 1002 OKI 0.1453723 0.005305157 0.80046910 0.0266097598 0.02224370
## 3 1003 OKI 0.1228256 0.012105865 0.78983307 0.0244478285 0.05078756
## 4 1004 OKI 0.3562233 0.066148236 0.01942665 0.0003020810 0.55789977
## 5 1005 OKI 0.3138226 0.046455488 0.01050009 0.0001099689 0.62911183
## 6 1006 OKI 0.1302217 0.010048003 0.79356050 0.0271946248 0.03897528
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
source(
here(
"scripts", "analysis", "my_theme3.R"
)
)
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#FF00FF",
"v3" = "#FFFF00",
"v4" = "#0000FF",
"v5" = "#FF0000"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
color = names(color_palette))
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture training k5.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Transfer data to cluster
rsync -chavzP --stats /Users/lucianocosme/Library/CloudStorage/Dropbox/Albopictus/manuscript_chip/data/no_autogenous/albo_chip/output/populations/nadmix lvc26@mccleary.ycrc.yale.edu:/gpfs/ycga/project/caccone/lvc26/
Inference mode (projective analysis)
Download the data
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "neutral","neutral_inference.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.117 0.0366 0.0426 0.00120 0.803
## 2 0.146 0.00545 0.796 0.0295 0.0227
## 3 0.112 0.0123 0.799 0.0267 0.0502
## 4 0.352 0.0681 0.0194 0.000319 0.560
## 5 0.314 0.0490 0.0105 0.000114 0.626
## 6 0.120 0.0100 0.803 0.0288 0.0382
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "neutral.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.1168938 0.036599070 0.04261101 0.0012043122 0.80269182
## 2 1002 OKI 0.1461714 0.005454067 0.79623282 0.0294865184 0.02265519
## 3 1003 OKI 0.1121285 0.012327265 0.79862928 0.0266750623 0.05023989
## 4 1004 OKI 0.3522965 0.068144552 0.01943193 0.0003188957 0.55980808
## 5 1005 OKI 0.3143196 0.048997048 0.01052473 0.0001135770 0.62604505
## 6 1006 OKI 0.1203695 0.009999427 0.80266857 0.0287723672 0.03819018
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.1168938 0.036599070 0.04261101 0.0012043122 0.80269182
## 2 1002 OKI 0.1461714 0.005454067 0.79623282 0.0294865184 0.02265519
## 3 1003 OKI 0.1121285 0.012327265 0.79862928 0.0266750623 0.05023989
## 4 1004 OKI 0.3522965 0.068144552 0.01943193 0.0003188957 0.55980808
## 5 1005 OKI 0.3143196 0.048997048 0.01052473 0.0001135770 0.62604505
## 6 1006 OKI 0.1203695 0.009999427 0.80266857 0.0287723672 0.03819018
Import sample locations
## # A tibble: 6 × 6
## Pop_City Country Latitude Longitude Region Abbreviation
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Gelephu Bhutan 26.9 90.5 Asia GEL
## 2 Phnom Penh Cambodia 11.6 105. Asia CAM
## 3 Hainan China 19.2 110. Asia HAI
## 4 Yunnan China 24.5 101. Asia YUN
## 5 Hunan China 27.6 112. Asia HUN
## 6 Bengaluru India 13.0 77.6 Asia BEN
source(
here(
"scripts", "analysis", "my_theme3.R"
)
)
# Create a named vector to map countries to regions
country_to_region <- c(
"Bhutan" = "South Asia",
"Cambodia" = "Southeast Asia",
"China" = "East Asia",
"India" = "South Asia",
"Indonesia" = "Southeast Asia",
"Japan" = "East Asia",
"Malaysia" = "Southeast Asia",
"Maldives" = "South Asia",
"Nepal" = "South Asia",
"Sri Lanka" = "South Asia",
"Taiwan" = "East Asia",
"Thailand" = "Southeast Asia",
"Vietnam" = "Southeast Asia"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Melt the data frame for plotting
Q_melted <- nadmixk5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#FF00FF",
"v3" = "#FFFF00",
"v4" = "#0000FF",
"v5" = "#FF0000"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Map each variable to a name
color_mapping <- data.frame(variable = all_variables,
color = names(color_palette))
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Neuro-admixture inference for k5 with intergenic SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))