To work through this, clone the repository (an RStudio project) at https://github.com/eriqande/make-a-BGP-map to get all the necessary input files, etc. Then open up the RStudio project and run through ‘Make-a-BGP-map-Notebook.Rmd’.
tess3r
. Note that you can’t
use the default version of tess3r
, you have to use my fork
of it, which has some extra functionality.genoscapeRtools
Get those packages like this:
The rest of the packages you need can be downloaded from CRAN. If you
don’t have them you should get them: raster
,
sf
, fields
, downloader
, and
tidyverse
. The last one there gets ggplot2 and a number of
other packages by Hadley Wickham.
You can get those like this:
Some rund of the algorithms indicated a higher number of ancestral populations Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2934774 156.8 5552401 296.6 NA 4207570 224.8
## Vcells 4017714 30.7 8388608 64.0 32768 5815953 44.4
# Extract ancestry coefficients
leak7 <- read_delim(
here("output", "populations", "snps_sets", "neutral.snmf", "K7", "run3","neutral_r3.7.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak7)
## # A tibble: 6 × 7
## X1 X2 X3 X4 X5 X6 X7
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.481 0.0708 0.222 0.00569 0.0413 0.0686 0.110
## 2 0.241 0.0661 0.271 0.0172 0.0506 0.162 0.192
## 3 0.730 0.0596 0.129 0.00957 0.000100 0.000100 0.0716
## 4 0.999 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100
## 5 0.999 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100
## 6 0.721 0.0666 0.110 0.0293 0.000100 0.000100 0.0729
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "neutral.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1001 OKI 0.481261 0.07082050 0.22190700 0.00569292 0.041312000 0.068599300
## 2 1002 OKI 0.240552 0.06607170 0.27105700 0.01724490 0.050601700 0.162441000
## 3 1003 OKI 0.729612 0.05958920 0.12941900 0.00956504 0.000099982 0.000099982
## 4 1004 OKI 0.999400 0.00009995 0.00009995 0.00009995 0.000099950 0.000099950
## 5 1005 OKI 0.999400 0.00009995 0.00009995 0.00009995 0.000099950 0.000099950
## 6 1006 OKI 0.721117 0.06660400 0.10982200 0.02930780 0.000099982 0.000099982
## X7
## 1 0.11040800
## 2 0.19203200
## 3 0.07161450
## 4 0.00009995
## 5 0.00009995
## 6 0.07294920
Rename the columns
# Rename the columns starting from the third one
leak7 <- leak7 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak7)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1001 OKI 0.481261 0.07082050 0.22190700 0.00569292 0.041312000 0.068599300
## 2 1002 OKI 0.240552 0.06607170 0.27105700 0.01724490 0.050601700 0.162441000
## 3 1003 OKI 0.729612 0.05958920 0.12941900 0.00956504 0.000099982 0.000099982
## 4 1004 OKI 0.999400 0.00009995 0.00009995 0.00009995 0.000099950 0.000099950
## 5 1005 OKI 0.999400 0.00009995 0.00009995 0.00009995 0.000099950 0.000099950
## 6 1006 OKI 0.721117 0.06660400 0.10982200 0.02930780 0.000099982 0.000099982
## v7
## 1 0.11040800
## 2 0.19203200
## 3 0.07161450
## 4 0.00009995
## 5 0.00009995
## 6 0.07294920
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
Merge with pops
# Add an index column to Q_tibble
leak7$index <- seq_len(nrow(leak7))
# Perform the merge as before
df1 <-
merge(
leak7,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6
## 159 OKI 1001 0.481261 0.07082050 0.22190700 0.00569292 0.041312000 0.068599300
## 160 OKI 1002 0.240552 0.06607170 0.27105700 0.01724490 0.050601700 0.162441000
## 161 OKI 1003 0.729612 0.05958920 0.12941900 0.00956504 0.000099982 0.000099982
## 162 OKI 1004 0.999400 0.00009995 0.00009995 0.00009995 0.000099950 0.000099950
## 163 OKI 1005 0.999400 0.00009995 0.00009995 0.00009995 0.000099950 0.000099950
## 164 OKI 1006 0.721117 0.06660400 0.10982200 0.02930780 0.000099982 0.000099982
## v7 Latitude Longitude Pop_City Country
## 159 0.11040800 26.5013 127.9454 Okinawa Japan
## 160 0.19203200 26.5013 127.9454 Okinawa Japan
## 161 0.07161450 26.5013 127.9454 Okinawa Japan
## 162 0.00009995 26.5013 127.9454 Okinawa Japan
## 163 0.00009995 26.5013 127.9454 Okinawa Japan
## 164 0.07294920 26.5013 127.9454 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#FF0000",
"v2" = "#0000FF",
"v3" = "#FF00FF",
"v4" = "#D0F0C0",
"v5" = "#00FF00",
"v6" = "#FFFF00",
"v7" = "#AEC6CF"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
Within Eric’s fork of tess3r is a function called tess3Q_map_rasters. It takes input from the objects we have above, but it takes that input as matrices rather than data frames, etc. so there is a little finagling to be done.
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5 v6
## [1,] 0.481261 0.07082050 0.22190700 0.00569292 0.041312000 0.068599300
## [2,] 0.240552 0.06607170 0.27105700 0.01724490 0.050601700 0.162441000
## [3,] 0.729612 0.05958920 0.12941900 0.00956504 0.000099982 0.000099982
## [4,] 0.999400 0.00009995 0.00009995 0.00009995 0.000099950 0.000099950
## [5,] 0.999400 0.00009995 0.00009995 0.00009995 0.000099950 0.000099950
## [6,] 0.721117 0.06660400 0.10982200 0.02930780 0.000099982 0.000099982
## v7
## [1,] 0.11040800
## [2,] 0.19203200
## [3,] 0.07161450
## [4,] 0.00009995
## [5,] 0.00009995
## [6,] 0.07294920
For this, we use the above variables in tess3r::tess3Q_map_rasters(). Note the use of namespace addressing for this function rather than loading the whole tess3r package with the library() command.
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(80),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 31378.74 (eff. df= 3.00096 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
# after that, we need to add names of the clusters back onto this raster brick
Q_tibble2 <- leak7 |>
dplyr::select(
-pop, -ind, -index
)
names(genoscape_brick) <- names(Q_tibble2)[]
That gives us a raster brick of Q-values associated with each cell in the raster, but those values are not always constrained between 0 and 1, so we have to massage them a little bit in the next section.
For this we use the function ‘genoscapeRtools::qprob_rando_raster()’. This takes the raster brick that comes out of tess3Q_map_rasters() and does some rescaling and (maybe) some random sampling to return a raster of colors that I hope will do a reliable job of representing (in some way) predicted assignment accuracy over space. See ‘?genoscapeRtools::qprob_rando_raster’ to learn about the scaling options, etc. (However, I am not convinced that all of those options are reliably estimated.)
This will squash the raster brick down to a single RGBA (i.e., four channels, red, green, blue and alpha) raster brick.
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
We can easily plot this with the function layer_spatial from the ggspatial package:
With pies
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7"), color = NA) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
# #
ggsave(
here("output", "populations", "figures", "lea_neutral_k7_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4704966 251.3 9034088 482.5 NA 9034088 482.5
## Vcells 8786799 67.1 54999240 419.7 32768 68734537 524.5
# Extract ancestry coefficients
leak5 <- read_delim(
here("output", "populations", "snps_sets", "neutral.snmf", "K5", "run3","neutral_r3.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.486 0.156 0.0171 0.0872 0.255
## 2 0.413 0.221 0.0428 0.0816 0.241
## 3 0.609 0.0970 0.0268 0.102 0.165
## 4 0.711 0.123 0.0448 0.0523 0.0687
## 5 0.769 0.128 0.0445 0.0295 0.0289
## 6 0.595 0.117 0.0503 0.0863 0.151
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "neutral.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.485589 0.1555300 0.0171214 0.0872414 0.2545190
## 2 1002 OKI 0.413323 0.2213730 0.0427845 0.0816402 0.2408790
## 3 1003 OKI 0.609096 0.0969784 0.0268309 0.1019500 0.1651450
## 4 1004 OKI 0.711456 0.1227820 0.0447598 0.0522809 0.0687215
## 5 1005 OKI 0.769209 0.1279830 0.0444730 0.0294702 0.0288648
## 6 1006 OKI 0.595327 0.1167970 0.0502820 0.0862996 0.1512940
Rename the columns
# Rename the columns starting from the third one
leak5 <- leak5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.485589 0.1555300 0.0171214 0.0872414 0.2545190
## 2 1002 OKI 0.413323 0.2213730 0.0427845 0.0816402 0.2408790
## 3 1003 OKI 0.609096 0.0969784 0.0268309 0.1019500 0.1651450
## 4 1004 OKI 0.711456 0.1227820 0.0447598 0.0522809 0.0687215
## 5 1005 OKI 0.769209 0.1279830 0.0444730 0.0294702 0.0288648
## 6 1006 OKI 0.595327 0.1167970 0.0502820 0.0862996 0.1512940
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
Merge with pops
# Add an index column to Q_tibble
leak5$index <- seq_len(nrow(leak5))
# Perform the merge as before
df1 <-
merge(
leak5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude
## 159 OKI 1001 0.485589 0.1555300 0.0171214 0.0872414 0.2545190 26.5013
## 160 OKI 1002 0.413323 0.2213730 0.0427845 0.0816402 0.2408790 26.5013
## 161 OKI 1003 0.609096 0.0969784 0.0268309 0.1019500 0.1651450 26.5013
## 162 OKI 1004 0.711456 0.1227820 0.0447598 0.0522809 0.0687215 26.5013
## 163 OKI 1005 0.769209 0.1279830 0.0444730 0.0294702 0.0288648 26.5013
## 164 OKI 1006 0.595327 0.1167970 0.0502820 0.0862996 0.1512940 26.5013
## Longitude Pop_City Country
## 159 127.9454 Okinawa Japan
## 160 127.9454 Okinawa Japan
## 161 127.9454 Okinawa Japan
## 162 127.9454 Okinawa Japan
## 163 127.9454 Okinawa Japan
## 164 127.9454 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#FF0000",
"v2" = "#FFFF00",
"v3" = "#FF00FF",
"v4" = "#0000FF",
"v5" = "#00FF00"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.485589 0.1555300 0.0171214 0.0872414 0.2545190
## [2,] 0.413323 0.2213730 0.0427845 0.0816402 0.2408790
## [3,] 0.609096 0.0969784 0.0268309 0.1019500 0.1651450
## [4,] 0.711456 0.1227820 0.0447598 0.0522809 0.0687215
## [5,] 0.769209 0.1279830 0.0444730 0.0294702 0.0288648
## [6,] 0.595327 0.1167970 0.0502820 0.0862996 0.1512940
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(80),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
We can easily plot this with the function layer_spatial from the ggspatial package:
With pies
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
# #
ggsave(
here("output", "populations", "figures", "lea_neutral_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4705575 251.4 9034088 482.5 NA 9034088 482.5
## Vcells 8789768 67.1 35199514 268.6 32768 68734537 524.5
leak5 <- read_delim(
here("output", "populations", "snps_sets", "r2_0.01.snmf", "K5", "run2","r2_0.01_r2.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0574 0.0290 0.240 0.0219 0.651
## 2 0.0936 0.0212 0.239 0.0460 0.600
## 3 0.0114 0.00895 0.156 0.0202 0.804
## 4 0.000100 0.0106 0.117 0.000920 0.872
## 5 0.000100 0.0174 0.0831 0.000100 0.899
## 6 0.000100 0.0305 0.154 0.0105 0.805
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.057351200 0.02903100 0.240333 0.021857400 0.651427
## 2 1002 OKI 0.093595600 0.02123130 0.239314 0.045972700 0.599886
## 3 1003 OKI 0.011381600 0.00894785 0.155884 0.020222400 0.803564
## 4 1004 OKI 0.000099990 0.01060030 0.116732 0.000919858 0.871648
## 5 1005 OKI 0.000099982 0.01739940 0.083069 0.000099982 0.899332
## 6 1006 OKI 0.000099990 0.03047430 0.153502 0.010503400 0.805420
Rename the columns
# Rename the columns starting from the third one
leak5 <- leak5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.057351200 0.02903100 0.240333 0.021857400 0.651427
## 2 1002 OKI 0.093595600 0.02123130 0.239314 0.045972700 0.599886
## 3 1003 OKI 0.011381600 0.00894785 0.155884 0.020222400 0.803564
## 4 1004 OKI 0.000099990 0.01060030 0.116732 0.000919858 0.871648
## 5 1005 OKI 0.000099982 0.01739940 0.083069 0.000099982 0.899332
## 6 1006 OKI 0.000099990 0.03047430 0.153502 0.010503400 0.805420
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
Merge with pops
# Add an index column to Q_tibble
leak5$index <- seq_len(nrow(leak5))
# Perform the merge as before
df1 <-
merge(
leak5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude
## 159 OKI 1001 0.057351200 0.02903100 0.240333 0.021857400 0.651427 26.5013
## 160 OKI 1002 0.093595600 0.02123130 0.239314 0.045972700 0.599886 26.5013
## 161 OKI 1003 0.011381600 0.00894785 0.155884 0.020222400 0.803564 26.5013
## 162 OKI 1004 0.000099990 0.01060030 0.116732 0.000919858 0.871648 26.5013
## 163 OKI 1005 0.000099982 0.01739940 0.083069 0.000099982 0.899332 26.5013
## 164 OKI 1006 0.000099990 0.03047430 0.153502 0.010503400 0.805420 26.5013
## Longitude Pop_City Country
## 159 127.9454 Okinawa Japan
## 160 127.9454 Okinawa Japan
## 161 127.9454 Okinawa Japan
## 162 127.9454 Okinawa Japan
## 163 127.9454 Okinawa Japan
## 164 127.9454 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#FFFF00",
"v2" = "#FF00FF",
"v3" = "#00FF00",
"v4" = "#0000FF",
"v5" = "#FF0000"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.057351200 0.02903100 0.240333 0.021857400 0.651427
## [2,] 0.093595600 0.02123130 0.239314 0.045972700 0.599886
## [3,] 0.011381600 0.00894785 0.155884 0.020222400 0.803564
## [4,] 0.000099990 0.01060030 0.116732 0.000919858 0.871648
## [5,] 0.000099982 0.01739940 0.083069 0.000099982 0.899332
## [6,] 0.000099990 0.03047430 0.153502 0.010503400 0.805420
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(80),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
We can easily plot this with the function layer_spatial from the ggspatial package:
With pies
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
# #
ggsave(
here("output", "populations", "figures", "lea_r2_0.01_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4705885 251.4 11296654 603.4 NA 10963441 585.6
## Vcells 8791454 67.1 40690640 310.5 32768 68734537 524.5
# Extract ancestry coefficients
leak5 <- read_delim(
here("output", "populations", "snps_sets", "r2_0.1.snmf", "K5", "run5","r2_0.1_r5.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00901 0.134 0.601 0.0199 0.236
## 2 0.0206 0.155 0.568 0.0289 0.227
## 3 0.0175 0.103 0.711 0.000100 0.169
## 4 0.00882 0.0695 0.777 0.0136 0.131
## 5 0.00145 0.0634 0.808 0.0136 0.113
## 6 0.0128 0.104 0.723 0.000100 0.160
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.00901176 0.1344870 0.600824 0.019856000 0.235822
## 2 1002 OKI 0.02059050 0.1549340 0.568124 0.028927700 0.227423
## 3 1003 OKI 0.01745480 0.1030830 0.710789 0.000099991 0.168573
## 4 1004 OKI 0.00881626 0.0694981 0.777402 0.013613100 0.130670
## 5 1005 OKI 0.00145040 0.0633980 0.808295 0.013615700 0.113241
## 6 1006 OKI 0.01277640 0.1043520 0.723096 0.000099991 0.159675
Rename the columns
# Rename the columns starting from the third one
leak5 <- leak5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.00901176 0.1344870 0.600824 0.019856000 0.235822
## 2 1002 OKI 0.02059050 0.1549340 0.568124 0.028927700 0.227423
## 3 1003 OKI 0.01745480 0.1030830 0.710789 0.000099991 0.168573
## 4 1004 OKI 0.00881626 0.0694981 0.777402 0.013613100 0.130670
## 5 1005 OKI 0.00145040 0.0633980 0.808295 0.013615700 0.113241
## 6 1006 OKI 0.01277640 0.1043520 0.723096 0.000099991 0.159675
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
Merge with pops
# Add an index column to Q_tibble
leak5$index <- seq_len(nrow(leak5))
# Perform the merge as before
df1 <-
merge(
leak5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude
## 159 OKI 1001 0.00901176 0.1344870 0.600824 0.019856000 0.235822 26.5013
## 160 OKI 1002 0.02059050 0.1549340 0.568124 0.028927700 0.227423 26.5013
## 161 OKI 1003 0.01745480 0.1030830 0.710789 0.000099991 0.168573 26.5013
## 162 OKI 1004 0.00881626 0.0694981 0.777402 0.013613100 0.130670 26.5013
## 163 OKI 1005 0.00145040 0.0633980 0.808295 0.013615700 0.113241 26.5013
## 164 OKI 1006 0.01277640 0.1043520 0.723096 0.000099991 0.159675 26.5013
## Longitude Pop_City Country
## 159 127.9454 Okinawa Japan
## 160 127.9454 Okinawa Japan
## 161 127.9454 Okinawa Japan
## 162 127.9454 Okinawa Japan
## 163 127.9454 Okinawa Japan
## 164 127.9454 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#0000FF",
"v2" = "#FFFF00",
"v3" = "#FF0000",
"v4" = "#FF00FF",
"v5" = "#00FF00"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.00901176 0.1344870 0.600824 0.019856000 0.235822
## [2,] 0.02059050 0.1549340 0.568124 0.028927700 0.227423
## [3,] 0.01745480 0.1030830 0.710789 0.000099991 0.168573
## [4,] 0.00881626 0.0694981 0.777402 0.013613100 0.130670
## [5,] 0.00145040 0.0633980 0.808295 0.013615700 0.113241
## [6,] 0.01277640 0.1043520 0.723096 0.000099991 0.159675
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(80),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
We can easily plot this with the function layer_spatial from the ggspatial package:
With pies
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
# #
ggsave(
here("output", "populations", "figures", "lea_r2_0.1_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4706114 251.4 11296654 603.4 NA 11296654 603.4
## Vcells 8793004 67.1 39127015 298.6 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "faststructure", "neutral", "run01", "simple.5.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.561 0.000012 0.000012 0.439 0.000012
## 2 0.479 0.109 0.000012 0.412 0.000012
## 3 0.575 0.000012 0.000012 0.425 0.000012
## 4 0.527 0.000012 0.000012 0.473 0.000012
## 5 0.527 0.000012 0.000012 0.473 0.000012
## 6 0.561 0.000012 0.000012 0.438 0.000012
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "neutral.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.560940 0.000012 1.2e-05 0.439025 1.2e-05
## 2 1002 OKI 0.479157 0.109106 1.2e-05 0.411713 1.2e-05
## 3 1003 OKI 0.574956 0.000012 1.2e-05 0.425009 1.2e-05
## 4 1004 OKI 0.526625 0.000012 1.2e-05 0.473340 1.2e-05
## 5 1005 OKI 0.526835 0.000012 1.2e-05 0.473130 1.2e-05
## 6 1006 OKI 0.561466 0.000012 1.2e-05 0.438499 1.2e-05
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.560940 0.000012 1.2e-05 0.439025 1.2e-05
## 2 1002 OKI 0.479157 0.109106 1.2e-05 0.411713 1.2e-05
## 3 1003 OKI 0.574956 0.000012 1.2e-05 0.425009 1.2e-05
## 4 1004 OKI 0.526625 0.000012 1.2e-05 0.473340 1.2e-05
## 5 1005 OKI 0.526835 0.000012 1.2e-05 0.473130 1.2e-05
## 6 1006 OKI 0.561466 0.000012 1.2e-05 0.438499 1.2e-05
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 159 OKI 1001 0.560940 0.000012 1.2e-05 0.439025 1.2e-05 26.5013 127.9454
## 160 OKI 1002 0.479157 0.109106 1.2e-05 0.411713 1.2e-05 26.5013 127.9454
## 161 OKI 1003 0.574956 0.000012 1.2e-05 0.425009 1.2e-05 26.5013 127.9454
## 162 OKI 1004 0.526625 0.000012 1.2e-05 0.473340 1.2e-05 26.5013 127.9454
## 163 OKI 1005 0.526835 0.000012 1.2e-05 0.473130 1.2e-05 26.5013 127.9454
## 164 OKI 1006 0.561466 0.000012 1.2e-05 0.438499 1.2e-05 26.5013 127.9454
## Pop_City Country
## 159 Okinawa Japan
## 160 Okinawa Japan
## 161 Okinawa Japan
## 162 Okinawa Japan
## 163 Okinawa Japan
## 164 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#00FF00",
"v2" = "#FF0000",
"v3" = "#FF00FF",
"v4" = "#FFFF00",
"v5" = "#0000FF"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
ggsave(
here("output", "populations", "figures", "fastStructure_neutral_simple_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.560940 0.000012 1.2e-05 0.439025 1.2e-05
## [2,] 0.479157 0.109106 1.2e-05 0.411713 1.2e-05
## [3,] 0.574956 0.000012 1.2e-05 0.425009 1.2e-05
## [4,] 0.526625 0.000012 1.2e-05 0.473340 1.2e-05
## [5,] 0.526835 0.000012 1.2e-05 0.473130 1.2e-05
## [6,] 0.561466 0.000012 1.2e-05 0.438499 1.2e-05
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
With pies
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "fastStructure_neutral_simple_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4706419 251.4 11296654 603.4 NA 11296654 603.4
## Vcells 8794782 67.1 37625935 287.1 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "faststructure", "neutral", "run02", "logistic.5.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000012 0.000012 1.00 0.000183 0.000035
## 2 0.872 0.000012 0.128 0.000012 0.000012
## 3 0.000012 0.000063 1.00 0.000014 0.000012
## 4 0.000012 0.000012 1.00 0.000011 0.000015
## 5 0.000012 0.000034 1.00 0.000029 0.000012
## 6 0.000012 0.000013 1.00 0.000046 0.000012
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "neutral.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.000012 1.2e-05 0.999760 0.000183 3.5e-05
## 2 1002 OKI 0.871549 1.2e-05 0.128416 0.000012 1.2e-05
## 3 1003 OKI 0.000012 6.3e-05 0.999899 0.000014 1.2e-05
## 4 1004 OKI 0.000012 1.2e-05 0.999950 0.000011 1.5e-05
## 5 1005 OKI 0.000012 3.4e-05 0.999913 0.000029 1.2e-05
## 6 1006 OKI 0.000012 1.3e-05 0.999918 0.000046 1.2e-05
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.000012 1.2e-05 0.999760 0.000183 3.5e-05
## 2 1002 OKI 0.871549 1.2e-05 0.128416 0.000012 1.2e-05
## 3 1003 OKI 0.000012 6.3e-05 0.999899 0.000014 1.2e-05
## 4 1004 OKI 0.000012 1.2e-05 0.999950 0.000011 1.5e-05
## 5 1005 OKI 0.000012 3.4e-05 0.999913 0.000029 1.2e-05
## 6 1006 OKI 0.000012 1.3e-05 0.999918 0.000046 1.2e-05
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 159 OKI 1001 0.000012 1.2e-05 0.999760 0.000183 3.5e-05 26.5013 127.9454
## 160 OKI 1002 0.871549 1.2e-05 0.128416 0.000012 1.2e-05 26.5013 127.9454
## 161 OKI 1003 0.000012 6.3e-05 0.999899 0.000014 1.2e-05 26.5013 127.9454
## 162 OKI 1004 0.000012 1.2e-05 0.999950 0.000011 1.5e-05 26.5013 127.9454
## 163 OKI 1005 0.000012 3.4e-05 0.999913 0.000029 1.2e-05 26.5013 127.9454
## 164 OKI 1006 0.000012 1.3e-05 0.999918 0.000046 1.2e-05 26.5013 127.9454
## Pop_City Country
## 159 Okinawa Japan
## 160 Okinawa Japan
## 161 Okinawa Japan
## 162 Okinawa Japan
## 163 Okinawa Japan
## 164 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#FF0000",
"v2" = "#0000FF",
"v3" = "#FF00FF",
"v4" = "#FFFF00",
"v5" = "#00FF00"#,
# "v6" = "#008080",
# "v7" = "#FF00FF"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
ggsave(
here("output", "populations", "figures", "fastStructure_neutral_logistic_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix
## v1 v2 v3 v4 v5
## [1,] 0.000012 1.2e-05 0.999760 0.000183 3.5e-05
## [2,] 0.871549 1.2e-05 0.128416 0.000012 1.2e-05
## [3,] 0.000012 6.3e-05 0.999899 0.000014 1.2e-05
## [4,] 0.000012 1.2e-05 0.999950 0.000011 1.5e-05
## [5,] 0.000012 3.4e-05 0.999913 0.000029 1.2e-05
## [6,] 0.000012 1.3e-05 0.999918 0.000046 1.2e-05
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
We can easily plot this with the function layer_spatial from the ggspatial package:
Plot
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "fastStructure_neutral_logistic_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4706681 251.4 11296654 603.4 NA 11296654 603.4
## Vcells 8796457 67.2 43830081 334.4 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "faststructure", "r2_0.01", "run1", "simple.5.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0926 0.274 0.000005 0.633 0.000005
## 2 0.126 0.245 0.000005 0.629 0.000005
## 3 0.0385 0.172 0.000005 0.789 0.000005
## 4 0.000005 0.123 0.000005 0.877 0.000005
## 5 0.000005 0.0852 0.000005 0.915 0.000005
## 6 0.000005 0.188 0.000005 0.800 0.0128
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.092564 0.274411 5e-06 0.633015 0.000005
## 2 1002 OKI 0.125992 0.244922 5e-06 0.629076 0.000005
## 3 1003 OKI 0.038454 0.172095 5e-06 0.789440 0.000005
## 4 1004 OKI 0.000005 0.123287 5e-06 0.876698 0.000005
## 5 1005 OKI 0.000005 0.085162 5e-06 0.914823 0.000005
## 6 1006 OKI 0.000005 0.187602 5e-06 0.799554 0.012834
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.092564 0.274411 5e-06 0.633015 0.000005
## 2 1002 OKI 0.125992 0.244922 5e-06 0.629076 0.000005
## 3 1003 OKI 0.038454 0.172095 5e-06 0.789440 0.000005
## 4 1004 OKI 0.000005 0.123287 5e-06 0.876698 0.000005
## 5 1005 OKI 0.000005 0.085162 5e-06 0.914823 0.000005
## 6 1006 OKI 0.000005 0.187602 5e-06 0.799554 0.012834
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 159 OKI 1001 0.092564 0.274411 5e-06 0.633015 0.000005 26.5013 127.9454
## 160 OKI 1002 0.125992 0.244922 5e-06 0.629076 0.000005 26.5013 127.9454
## 161 OKI 1003 0.038454 0.172095 5e-06 0.789440 0.000005 26.5013 127.9454
## 162 OKI 1004 0.000005 0.123287 5e-06 0.876698 0.000005 26.5013 127.9454
## 163 OKI 1005 0.000005 0.085162 5e-06 0.914823 0.000005 26.5013 127.9454
## 164 OKI 1006 0.000005 0.187602 5e-06 0.799554 0.012834 26.5013 127.9454
## Pop_City Country
## 159 Okinawa Japan
## 160 Okinawa Japan
## 161 Okinawa Japan
## 162 Okinawa Japan
## 163 Okinawa Japan
## 164 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#FFFF00",
"v2" = "#00FF00",
"v3" = "#FF00FF",
"v4" = "#FF0000",
"v5" = "#0000FF"#,
# "v6" = "#008080",
# "v7" = "#FF00FF"
)
Plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
ggsave(
here("output", "populations", "figures", "fastStructure_r2_0.01_simple_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
## v1 v2 v3 v4 v5
## [1,] 0.092564 0.274411 5e-06 0.633015 0.000005
## [2,] 0.125992 0.244922 5e-06 0.629076 0.000005
## [3,] 0.038454 0.172095 5e-06 0.789440 0.000005
## [4,] 0.000005 0.123287 5e-06 0.876698 0.000005
## [5,] 0.000005 0.085162 5e-06 0.914823 0.000005
## [6,] 0.000005 0.187602 5e-06 0.799554 0.012834
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
Plot
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "fastStructure_r2_0.01_simple_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4706905 251.4 11296654 603.4 NA 11296654 603.4
## Vcells 8798192 67.2 42172630 321.8 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "faststructure", "r2_0.01", "run1", "logistic.5.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000005 0.000005 0.000005 1.00 0.000005
## 2 0.000005 0.000005 0.000005 1.00 0.000005
## 3 0.000005 0.000005 0.000005 1.00 0.000005
## 4 0.000005 0.000005 0.000005 1.00 0.000005
## 5 0.000005 0.000005 0.000005 1.00 0.000005
## 6 0.000005 0.000005 0.000005 1.00 0.000005
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 2 1002 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 3 1003 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 4 1004 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 5 1005 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 6 1006 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 2 1002 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 3 1003 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 4 1004 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 5 1005 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
## 6 1006 OKI 5e-06 5e-06 5e-06 0.99998 5e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude Pop_City
## 159 OKI 1001 5e-06 5e-06 5e-06 0.99998 5e-06 26.5013 127.9454 Okinawa
## 160 OKI 1002 5e-06 5e-06 5e-06 0.99998 5e-06 26.5013 127.9454 Okinawa
## 161 OKI 1003 5e-06 5e-06 5e-06 0.99998 5e-06 26.5013 127.9454 Okinawa
## 162 OKI 1004 5e-06 5e-06 5e-06 0.99998 5e-06 26.5013 127.9454 Okinawa
## 163 OKI 1005 5e-06 5e-06 5e-06 0.99998 5e-06 26.5013 127.9454 Okinawa
## 164 OKI 1006 5e-06 5e-06 5e-06 0.99998 5e-06 26.5013 127.9454 Okinawa
## Country
## 159 Japan
## 160 Japan
## 161 Japan
## 162 Japan
## 163 Japan
## 164 Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#00FF00",
"v2" = "#0000FF",
"v3" = "#FFFF00",
"v4" = "#FF00FF",
"v5" = "#FF0000"#,
# "v6" = "#008080",
# "v7" = "#FF00FF"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
ggsave(
here("output", "populations", "figures", "fastStructure_r2_0.01_logistic_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it
## v1 v2 v3 v4 v5
## [1,] 5e-06 5e-06 5e-06 0.99998 5e-06
## [2,] 5e-06 5e-06 5e-06 0.99998 5e-06
## [3,] 5e-06 5e-06 5e-06 0.99998 5e-06
## [4,] 5e-06 5e-06 5e-06 0.99998 5e-06
## [5,] 5e-06 5e-06 5e-06 0.99998 5e-06
## [6,] 5e-06 5e-06 5e-06 0.99998 5e-06
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
Plot
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "fastStructure_r2_0.01_logistic_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4707202 251.4 11296654 603.4 NA 11296654 603.4
## Vcells 8799708 67.2 40712999 310.7 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "faststructure", "r2_0.1", "run1", "simple.5.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 0.000002 0.0114 0.791 0.198
## 2 0.146 0.000002 0.000002 0.694 0.160
## 3 0.000002 0.000002 0.000002 0.949 0.0510
## 4 0.000002 0.000002 0.000002 1.00 0.000002
## 5 0.000002 0.000002 0.000002 1.00 0.000002
## 6 0.000002 0.000002 0.000002 1.00 0.000002
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.000002 2e-06 0.011353 0.790797 0.197846
## 2 1002 OKI 0.146314 2e-06 0.000002 0.694085 0.159598
## 3 1003 OKI 0.000002 2e-06 0.000002 0.949043 0.050951
## 4 1004 OKI 0.000002 2e-06 0.000002 0.999993 0.000002
## 5 1005 OKI 0.000002 2e-06 0.000002 0.999993 0.000002
## 6 1006 OKI 0.000002 2e-06 0.000002 0.999993 0.000002
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.000002 2e-06 0.011353 0.790797 0.197846
## 2 1002 OKI 0.146314 2e-06 0.000002 0.694085 0.159598
## 3 1003 OKI 0.000002 2e-06 0.000002 0.949043 0.050951
## 4 1004 OKI 0.000002 2e-06 0.000002 0.999993 0.000002
## 5 1005 OKI 0.000002 2e-06 0.000002 0.999993 0.000002
## 6 1006 OKI 0.000002 2e-06 0.000002 0.999993 0.000002
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 159 OKI 1001 0.000002 2e-06 0.011353 0.790797 0.197846 26.5013 127.9454
## 160 OKI 1002 0.146314 2e-06 0.000002 0.694085 0.159598 26.5013 127.9454
## 161 OKI 1003 0.000002 2e-06 0.000002 0.949043 0.050951 26.5013 127.9454
## 162 OKI 1004 0.000002 2e-06 0.000002 0.999993 0.000002 26.5013 127.9454
## 163 OKI 1005 0.000002 2e-06 0.000002 0.999993 0.000002 26.5013 127.9454
## 164 OKI 1006 0.000002 2e-06 0.000002 0.999993 0.000002 26.5013 127.9454
## Pop_City Country
## 159 Okinawa Japan
## 160 Okinawa Japan
## 161 Okinawa Japan
## 162 Okinawa Japan
## 163 Okinawa Japan
## 164 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#FFFF00",
"v2" = "#0000FF",
"v3" = "#FF00FF",
"v4" = "#FF0000",
"v5" = "#00FF00"
)
Plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
ggsave(
here("output", "populations", "figures", "fastStructure_r2_0.1_simple_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.000002 2e-06 0.011353 0.790797 0.197846
## [2,] 0.146314 2e-06 0.000002 0.694085 0.159598
## [3,] 0.000002 2e-06 0.000002 0.949043 0.050951
## [4,] 0.000002 2e-06 0.000002 0.999993 0.000002
## [5,] 0.000002 2e-06 0.000002 0.999993 0.000002
## [6,] 0.000002 2e-06 0.000002 0.999993 0.000002
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
Plot
Plot
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "fastStructure_r2_0.1_simple_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4707418 251.5 11296654 603.4 NA 11296654 603.4
## Vcells 8801115 67.2 39185120 299.0 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "faststructure", "r2_0.1", "run1", "logistic.5.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 0.000002 0.102 0.000002 0.898
## 2 0.000002 0.000002 0.0529 0.0395 0.908
## 3 0.000002 0.000002 0.0328 0.000002 0.967
## 4 0.000002 0.000002 0.0318 0.000002 0.968
## 5 0.000002 0.000002 0.0112 0.000002 0.989
## 6 0.000002 0.000002 0.0348 0.000002 0.965
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 2e-06 2e-06 0.102183 0.000002 0.897812
## 2 1002 OKI 2e-06 2e-06 0.052898 0.039471 0.907627
## 3 1003 OKI 2e-06 2e-06 0.032833 0.000002 0.967162
## 4 1004 OKI 2e-06 2e-06 0.031840 0.000002 0.968155
## 5 1005 OKI 2e-06 2e-06 0.011165 0.000002 0.988830
## 6 1006 OKI 2e-06 2e-06 0.034841 0.000002 0.965154
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 2e-06 2e-06 0.102183 0.000002 0.897812
## 2 1002 OKI 2e-06 2e-06 0.052898 0.039471 0.907627
## 3 1003 OKI 2e-06 2e-06 0.032833 0.000002 0.967162
## 4 1004 OKI 2e-06 2e-06 0.031840 0.000002 0.968155
## 5 1005 OKI 2e-06 2e-06 0.011165 0.000002 0.988830
## 6 1006 OKI 2e-06 2e-06 0.034841 0.000002 0.965154
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude Pop_City
## 159 OKI 1001 2e-06 2e-06 0.102183 0.000002 0.897812 26.5013 127.9454 Okinawa
## 160 OKI 1002 2e-06 2e-06 0.052898 0.039471 0.907627 26.5013 127.9454 Okinawa
## 161 OKI 1003 2e-06 2e-06 0.032833 0.000002 0.967162 26.5013 127.9454 Okinawa
## 162 OKI 1004 2e-06 2e-06 0.031840 0.000002 0.968155 26.5013 127.9454 Okinawa
## 163 OKI 1005 2e-06 2e-06 0.011165 0.000002 0.988830 26.5013 127.9454 Okinawa
## 164 OKI 1006 2e-06 2e-06 0.034841 0.000002 0.965154 26.5013 127.9454 Okinawa
## Country
## 159 Japan
## 160 Japan
## 161 Japan
## 162 Japan
## 163 Japan
## 164 Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#FFFF00",
"v2" = "#0000FF",
"v3" = "#00FF00",
"v4" = "#FF0000",
"v5" = "#FF00FF"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
ggsave(
here("output", "populations", "figures", "fastStructure_r2_0.1_logistic_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it
## v1 v2 v3 v4 v5
## [1,] 2e-06 2e-06 0.102183 0.000002 0.897812
## [2,] 2e-06 2e-06 0.052898 0.039471 0.907627
## [3,] 2e-06 2e-06 0.032833 0.000002 0.967162
## [4,] 2e-06 2e-06 0.031840 0.000002 0.968155
## [5,] 2e-06 2e-06 0.011165 0.000002 0.988830
## [6,] 2e-06 2e-06 0.034841 0.000002 0.965154
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
Plot
Plot
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "fastStructure_r2_0.1_logistic_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4707731 251.5 11296654 603.4 NA 11296654 603.4
## Vcells 8802682 67.2 36238448 276.5 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "admixture", "neutral", "run1", "neutral.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.349 0.251 0.0197 0.0318 0.349
## 2 0.314 0.252 0.0244 0.00540 0.405
## 3 0.321 0.225 0.0554 0.00001 0.399
## 4 0.314 0.235 0.0173 0.0558 0.379
## 5 0.300 0.244 0.00001 0.0461 0.410
## 6 0.317 0.226 0.0539 0.0105 0.393
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "neutral.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.348771 0.250551 0.019674 0.031750 0.349254
## 2 1002 OKI 0.313650 0.251651 0.024403 0.005396 0.404899
## 3 1003 OKI 0.321265 0.224595 0.055417 0.000010 0.398712
## 4 1004 OKI 0.313674 0.234637 0.017323 0.055846 0.378520
## 5 1005 OKI 0.299965 0.244414 0.000010 0.046103 0.409507
## 6 1006 OKI 0.317372 0.225772 0.053897 0.010453 0.392506
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.348771 0.250551 0.019674 0.031750 0.349254
## 2 1002 OKI 0.313650 0.251651 0.024403 0.005396 0.404899
## 3 1003 OKI 0.321265 0.224595 0.055417 0.000010 0.398712
## 4 1004 OKI 0.313674 0.234637 0.017323 0.055846 0.378520
## 5 1005 OKI 0.299965 0.244414 0.000010 0.046103 0.409507
## 6 1006 OKI 0.317372 0.225772 0.053897 0.010453 0.392506
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 159 OKI 1001 0.348771 0.250551 0.019674 0.031750 0.349254 26.5013 127.9454
## 160 OKI 1002 0.313650 0.251651 0.024403 0.005396 0.404899 26.5013 127.9454
## 161 OKI 1003 0.321265 0.224595 0.055417 0.000010 0.398712 26.5013 127.9454
## 162 OKI 1004 0.313674 0.234637 0.017323 0.055846 0.378520 26.5013 127.9454
## 163 OKI 1005 0.299965 0.244414 0.000010 0.046103 0.409507 26.5013 127.9454
## 164 OKI 1006 0.317372 0.225772 0.053897 0.010453 0.392506 26.5013 127.9454
## Pop_City Country
## 159 Okinawa Japan
## 160 Okinawa Japan
## 161 Okinawa Japan
## 162 Okinawa Japan
## 163 Okinawa Japan
## 164 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#00FF00",
"v2" = "#FFFF00",
"v3" = "#0000FF",
"v4" = "#FF00FF",
"v5" = "#FF0000"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
ggsave(
here("output", "populations", "figures", "admixture_neutral_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.348771 0.250551 0.019674 0.031750 0.349254
## [2,] 0.313650 0.251651 0.024403 0.005396 0.404899
## [3,] 0.321265 0.224595 0.055417 0.000010 0.398712
## [4,] 0.313674 0.234637 0.017323 0.055846 0.378520
## [5,] 0.299965 0.244414 0.000010 0.046103 0.409507
## [6,] 0.317372 0.225772 0.053897 0.010453 0.392506
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
Plot
Plot
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "admixture_neutral_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4707992 251.5 11296654 603.4 NA 11296654 603.4
## Vcells 8804157 67.2 41887593 319.6 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "admixture", "r2_0.01", "run1", "r2_0.01.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.317 0.0707 0.00016 0.0522 0.560
## 2 0.279 0.0959 0.0108 0.0445 0.570
## 3 0.249 0.0636 0.0118 0.0126 0.663
## 4 0.243 0.0109 0.00547 0.0332 0.707
## 5 0.232 0.00151 0.00001 0.0344 0.732
## 6 0.250 0.0409 0.0151 0.0381 0.656
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.317249 0.070658 0.000160 0.052193 0.559741
## 2 1002 OKI 0.279230 0.095881 0.010824 0.044486 0.569579
## 3 1003 OKI 0.248873 0.063590 0.011806 0.012550 0.663181
## 4 1004 OKI 0.243349 0.010864 0.005469 0.033171 0.707148
## 5 1005 OKI 0.232079 0.001506 0.000010 0.034445 0.731960
## 6 1006 OKI 0.249958 0.040899 0.015095 0.038120 0.655927
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.317249 0.070658 0.000160 0.052193 0.559741
## 2 1002 OKI 0.279230 0.095881 0.010824 0.044486 0.569579
## 3 1003 OKI 0.248873 0.063590 0.011806 0.012550 0.663181
## 4 1004 OKI 0.243349 0.010864 0.005469 0.033171 0.707148
## 5 1005 OKI 0.232079 0.001506 0.000010 0.034445 0.731960
## 6 1006 OKI 0.249958 0.040899 0.015095 0.038120 0.655927
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 159 OKI 1001 0.317249 0.070658 0.000160 0.052193 0.559741 26.5013 127.9454
## 160 OKI 1002 0.279230 0.095881 0.010824 0.044486 0.569579 26.5013 127.9454
## 161 OKI 1003 0.248873 0.063590 0.011806 0.012550 0.663181 26.5013 127.9454
## 162 OKI 1004 0.243349 0.010864 0.005469 0.033171 0.707148 26.5013 127.9454
## 163 OKI 1005 0.232079 0.001506 0.000010 0.034445 0.731960 26.5013 127.9454
## 164 OKI 1006 0.249958 0.040899 0.015095 0.038120 0.655927 26.5013 127.9454
## Pop_City Country
## 159 Okinawa Japan
## 160 Okinawa Japan
## 161 Okinawa Japan
## 162 Okinawa Japan
## 163 Okinawa Japan
## 164 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#00FF00",
"v2" = "#FFFF00",
"v3" = "#0000FF",
"v4" = "#FF00FF",
"v5" = "#FF0000"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
ggsave(
here("output", "populations", "figures", "admixture_r2_0.01_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.317249 0.070658 0.000160 0.052193 0.559741
## [2,] 0.279230 0.095881 0.010824 0.044486 0.569579
## [3,] 0.248873 0.063590 0.011806 0.012550 0.663181
## [4,] 0.243349 0.010864 0.005469 0.033171 0.707148
## [5,] 0.232079 0.001506 0.000010 0.034445 0.731960
## [6,] 0.249958 0.040899 0.015095 0.038120 0.655927
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
We can easily plot this with the function layer_spatial from the ggspatial package:
Plot
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "admixture_r2_0.01_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4708207 251.5 11296654 603.4 NA 11296654 603.4
## Vcells 8805553 67.2 40276089 307.3 32768 68734537 524.5
Make plot
# Extract ancestry coefficients
k5run1 <- read_delim(
here("output", "populations", "admixture", "r2_0.1", "run1", "r2_0.1.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run1)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.248 0.129 0.00001 0.00912 0.614
## 2 0.220 0.145 0.00001 0.0184 0.616
## 3 0.181 0.0975 0.00001 0.00001 0.722
## 4 0.147 0.0112 0.00001 0.00516 0.837
## 5 0.105 0.00001 0.00001 0.00001 0.895
## 6 0.171 0.0981 0.00001 0.00001 0.731
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.247788 0.128696 1e-05 0.009116 0.614391
## 2 1002 OKI 0.220143 0.145400 1e-05 0.018377 0.616069
## 3 1003 OKI 0.180728 0.097481 1e-05 0.000010 0.721772
## 4 1004 OKI 0.146729 0.011211 1e-05 0.005160 0.836890
## 5 1005 OKI 0.105331 0.000010 1e-05 0.000010 0.894639
## 6 1006 OKI 0.171413 0.098054 1e-05 0.000010 0.730513
Rename the columns
# Rename the columns starting from the third one
k5run1 <- k5run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run1)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.247788 0.128696 1e-05 0.009116 0.614391
## 2 1002 OKI 0.220143 0.145400 1e-05 0.018377 0.616069
## 3 1003 OKI 0.180728 0.097481 1e-05 0.000010 0.721772
## 4 1004 OKI 0.146729 0.011211 1e-05 0.005160 0.836890
## 5 1005 OKI 0.105331 0.000010 1e-05 0.000010 0.894639
## 6 1006 OKI 0.171413 0.098054 1e-05 0.000010 0.730513
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
# Add an index column to Q_tibble
k5run1$index <- seq_len(nrow(k5run1))
# Perform the merge as before
df1 <-
merge(
k5run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 159 OKI 1001 0.247788 0.128696 1e-05 0.009116 0.614391 26.5013 127.9454
## 160 OKI 1002 0.220143 0.145400 1e-05 0.018377 0.616069 26.5013 127.9454
## 161 OKI 1003 0.180728 0.097481 1e-05 0.000010 0.721772 26.5013 127.9454
## 162 OKI 1004 0.146729 0.011211 1e-05 0.005160 0.836890 26.5013 127.9454
## 163 OKI 1005 0.105331 0.000010 1e-05 0.000010 0.894639 26.5013 127.9454
## 164 OKI 1006 0.171413 0.098054 1e-05 0.000010 0.730513 26.5013 127.9454
## Pop_City Country
## 159 Okinawa Japan
## 160 Okinawa Japan
## 161 Okinawa Japan
## 162 Okinawa Japan
## 163 Okinawa Japan
## 164 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#00FF00",
"v2" = "#FFFF00",
"v3" = "#0000FF",
"v4" = "#FF00FF",
"v5" = "#FF0000"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
# #
# ggsave(
# here("output", "populations", "figures", "admixture_r2_0.1_k5_pie.pdf"),
# width = 12,
# height = 6,
# units = "in",
# device = cairo_pdf
# )
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.247788 0.128696 1e-05 0.009116 0.614391
## [2,] 0.220143 0.145400 1e-05 0.018377 0.616069
## [3,] 0.180728 0.097481 1e-05 0.000010 0.721772
## [4,] 0.146729 0.011211 1e-05 0.005160 0.836890
## [5,] 0.105331 0.000010 1e-05 0.000010 0.894639
## [6,] 0.171413 0.098054 1e-05 0.000010 0.730513
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(40),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.04206056 (eff. df= 26.60001 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
Plot
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
ggspatial::layer_spatial(genoscape_rgba) +
my_theme() +
coord_sf()
Plot
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(
data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"),
color = NA
) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
ggsave(
here("output", "populations", "figures", "admixture_r2_0.1_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Colors
# For manuscript
color_palette2 <-
c(
"v1" = "#00FF00", # red
"v2" = "#FFFF00", # blue
"v3" = "#0000FF", # green
"v4" = "#FF00FF", # yellow
"v5" = "#FF0000" # magenta
)
Create brick
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
resolution = c(400,400),
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
# method = "map.max",
interpol = tess3r::FieldsKrigModel(10),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
# after that, we need to add names of the clusters back onto this raster brick
Q_tibble2 <- k5run1 |>
dplyr::select(
-pop, -ind, -index
)
names(genoscape_brick) <- names(Q_tibble2)[]
Scale
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
df_long <- df_mean |>
gather(key = "segment", value = "value", v1:v5)
# Calculate pie segments for plotting
df_long <- df_long |>
group_by(pop) |>
arrange(pop, segment) |>
mutate(cum_value = cumsum(value),
total = sum(value),
end_angle = 2 * pi * cum_value / total,
start_angle = c(0, head(end_angle, -1))) |>
ungroup()
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
# Adding a nudge value for the x-coordinate
nudge_x <- 1.5
nudge_y <- -.5
ggplot() +
geom_sf(
data = world,
fill = NA,
color = "gray",
lwd = 0.1,
alpha = 0.3
) +
layer_spatial(genoscape_rgba) +
ggforce::geom_arc_bar(
data = df_long,
aes(
x0 = Longitude + nudge_x,
y0 = Latitude + nudge_y,
r0 = 0,
r = 1,
start = start_angle,
end = end_angle,
fill = segment
),
color = NA
) + # This will plot the pie segments without borders
ggforce::geom_circle(
data = df_mean,
aes(
x0 = Longitude + nudge_x,
y0 = Latitude + nudge_y,
r = 1),
inherit.aes = FALSE,
color = "black",
fill = NA,
size = 0.05
) + # This will plot the outer circle
geom_point(
data = df_mean,
aes(x = Longitude, y = Latitude),
color = "black",
size = .3
) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 4,
box.padding = unit(.25, "lines"),
direction = "both",
max.iter = 2000,
force = 1,
max.overlaps = 50,
nudge_x = 3,
nudge_y = 0,
segment.alpha = 0
) +
labs(x = "Longitude",
y = "Latitude") +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60))
ggsave(
here("output", "populations", "figures", "admixture_r2_0.1_MANUSCRIPT.pdf"),
width = 12,
height = 12,
units = "in",
device = cairo_pdf
)
Create scale bars
# Your palette
color_palette2 <-
c(
"v1" = "#00FF00", # green
"v2" = "#FFFF00", # yellow
"v3" = "#0000FF", # blue
"v4" = "#FF00FF", # magenta
"v5" = "#FF0000" # red
)
# Create a dataframe with a row for every step in the range for every variable
df <- expand.grid(variable = names(color_palette2),
value = seq(0, 1, by = 0.1)) # 10 segments
# Function to compute gradient color
compute_gradient <- function(value, color) {
scales::gradient_n_pal(c("white", color))(value) # Use off-white as start color
}
# Compute gradient colors for each row in the dataframe
df$color <- mapply(compute_gradient, df$value, color_palette2[df$variable])
# Plot
ggplot(df, aes(x = 1, y = value, fill = color)) +
geom_tile(width = 0.5, height = 0.1) +
scale_fill_identity() +
scale_y_continuous(breaks = seq(0, 1, by = 0.2)) +
facet_wrap(~variable, scales = "free_y", ncol = 5) +
theme_classic() +
labs(title = "Ancestry coefficients") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.background = element_blank(),
strip.text = element_blank(),
axis.line.x = element_blank(),
axis.line.y = element_blank(),
plot.title.position = "plot", # Centers the title over the entire plot
plot.title = element_text(hjust = 0.5) # Ensures the title text itself is centered
)
ggsave(
here("output", "populations", "figures", "scale_bars.pdf"),
width = 6,
height = 6,
units = "in",
device = cairo_pdf
)
Usa tess3r plotting function
# Convert matrices to data frames
plot_data_df <- as.data.frame(df1[, c("v1", "v2", "v3", "v4", "v5")])
# Convert data.frame to matrix
plot_data_matrix <- as.matrix(plot_data_df)
# Set the class of the object to be tess3Q, in addition to matrix and array
class(plot_data_matrix) <- c("tess3Q", "matrix", "array")
# Extract the coordinates in numeric matrix format
coord_matrix <- as.matrix(df1[, c("Longitude", "Latitude")])
# Use the plot function with tess3Q object and coordinate matrix
plot(x = plot_data_matrix, coord = coord_matrix, method = "map.max",
resolution = c(400, 400),
interpolation.model = FieldsKrigModel(10),
cex = 0.4,
xlab = "Longitude",
ylab = "Latitude",
main = "Ancestry coefficients",
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)))
## Please note that 'maptools' will be retired during October 2023,
## plan transition at your earliest convenience (see
## https://r-spatial.org/r/2023/05/15/evolution4.html and earlier blogs
## for guidance);some functionality will be moved to 'sp'.
## Checking rgeos availability: TRUE
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## This function required to attach maps namespace.
Using ggplot
main_plot <- ggtess3Q(
plot_data_matrix,
coord_matrix,
resolution = c(400, 400),
window = extent(selected_countries)[1:4],
background = TRUE,
map.polygon = selected_countries,
raster.filename = NULL,
interpolation.model = FieldsKrigModel(10),
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2))
) +
my_theme() +
geom_sf(
data = world,
fill = NA,
color = "gray",
lwd = 0.1,
alpha = 0.3
) +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
labs(x = "Longitude",
y = "Latitude") +
geom_point(
data = df_mean,
aes(x = Longitude, y = Latitude),
color = "black",
size = .3
) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(.15, "lines"),
direction = "both",
max.iter = 2000,
force = 1,
max.overlaps = 50,
nudge_x = .5,
nudge_y = 0,
segment.alpha = 0
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.1525661 (eff. df= 26.60002 )
ggsave(
here("output", "populations", "figures", "test1.pdf"),
width = 6,
height = 6,
units = "in",
device = cairo_pdf
)
Add bards
# To add bars
# library(grid)
# library(ggplot2)
#
# Create a data frame for the color bars
color_df <- data.frame(
x = rep(1:5, each = 101),
y = rep(seq(0, 1, length.out = 101), times = 5),
color = c(
colorRampPalette(c("white", "#00FF00"))(101),
colorRampPalette(c("white", "#FFFF00"))(101),
colorRampPalette(c("white", "#0000FF"))(101),
colorRampPalette(c("white", "#FF00FF"))(101),
colorRampPalette(c("white", "#FF0000"))(101)
)
)
# Create the color bar plot with scale bar
color_bar_plot <- ggplot(color_df, aes(x = x, y = y, fill = color)) +
geom_tile(width = .5, height = 0.05) +
scale_fill_identity() +
geom_segment(data = data.frame(), aes(x = 0.55, xend = 5.5,
y = seq(0, 1, by = 0.25), yend = seq(0, 1, by = 0.25)),
inherit.aes = FALSE, size = 0.2, color = "darkgray", linetype = "dotted") +
scale_y_continuous(breaks = seq(0, 1, by = 0.25), limits = c(0, 1)) +
theme_void() +
labs(title = " Ancestry\ncoeficients") +
theme(axis.text.y = element_text(size = 10, color = "gray"),
plot.title = element_text(color = "darkgray"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 30 rows containing missing values (`geom_tile()`).
# Create a viewport to place the color bar plot
vp <- viewport(x = 0.9, y = 0.4, width = 0.2, height = 0.2)
# Create your main plot (e.g., ggtess3Q) and convert it to a grob
main_grob <- ggplotGrob(main_plot)
# Draw the main plot (Uncomment the next two lines when you have your main plot)
grid.newpage()
grid.draw(main_grob)
# Draw the color bar plot inside the defined viewport
pushViewport(vp)
grid.draw(color_bar_grob)
upViewport()
# Save the main plot and color bar plot together in a PDF file
pdf(here("output", "populations", "figures", "test2.pdf"), width = 6, height = 6) # Adjust width and height as needed
grid.newpage()
grid.draw(main_grob)
pushViewport(vp)
grid.draw(color_bar_grob)
upViewport()
dev.off() # Close the PDF device
## quartz_off_screen
## 2
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4809771 256.9 11296654 603.4 NA 11296654 603.4
## Vcells 7537174 57.6 31187996 238.0 32768 68734537 524.5
Import Q
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "r_0.01","r_0.01_inference.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0203 0.701 0.152 0.109 0.0173
## 2 0.0191 0.682 0.184 0.102 0.0130
## 3 0.0163 0.706 0.151 0.0845 0.0416
## 4 0.0341 0.671 0.167 0.109 0.0193
## 5 0.0389 0.692 0.168 0.0892 0.0128
## 6 0.0162 0.714 0.171 0.0784 0.0202
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.02026024 0.7009754 0.1522250 0.10923769 0.01730163
## 2 1002 OKI 0.01906374 0.6816174 0.1840338 0.10228224 0.01300276
## 3 1003 OKI 0.01627573 0.7062859 0.1513138 0.08453301 0.04159156
## 4 1004 OKI 0.03413134 0.6708513 0.1668813 0.10887521 0.01926081
## 5 1005 OKI 0.03889918 0.6916250 0.1675111 0.08921071 0.01275407
## 6 1006 OKI 0.01619347 0.7141496 0.1710819 0.07837918 0.02019598
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.02026024 0.7009754 0.1522250 0.10923769 0.01730163
## 2 1002 OKI 0.01906374 0.6816174 0.1840338 0.10228224 0.01300276
## 3 1003 OKI 0.01627573 0.7062859 0.1513138 0.08453301 0.04159156
## 4 1004 OKI 0.03413134 0.6708513 0.1668813 0.10887521 0.01926081
## 5 1005 OKI 0.03889918 0.6916250 0.1675111 0.08921071 0.01275407
## 6 1006 OKI 0.01619347 0.7141496 0.1710819 0.07837918 0.02019598
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
Merge with pops
# Add an index column to Q_tibble
nadmixk5$index <- seq_len(nrow(nadmixk5))
# Perform the merge as before
df1 <-
merge(
nadmixk5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude
## 159 OKI 1001 0.02026024 0.7009754 0.1522250 0.10923769 0.01730163 26.5013
## 160 OKI 1002 0.01906374 0.6816174 0.1840338 0.10228224 0.01300276 26.5013
## 161 OKI 1003 0.01627573 0.7062859 0.1513138 0.08453301 0.04159156 26.5013
## 162 OKI 1004 0.03413134 0.6708513 0.1668813 0.10887521 0.01926081 26.5013
## 163 OKI 1005 0.03889918 0.6916250 0.1675111 0.08921071 0.01275407 26.5013
## 164 OKI 1006 0.01619347 0.7141496 0.1710819 0.07837918 0.02019598 26.5013
## Longitude Pop_City Country
## 159 127.9454 Okinawa Japan
## 160 127.9454 Okinawa Japan
## 161 127.9454 Okinawa Japan
## 162 127.9454 Okinawa Japan
## 163 127.9454 Okinawa Japan
## 164 127.9454 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#00FF00",
"v2" = "#FFFF00",
"v3" = "#FF0000",
"v4" = "#0000FF",
"v5" = "#FF00FF"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.02026024 0.7009754 0.1522250 0.10923769 0.01730163
## [2,] 0.01906374 0.6816174 0.1840338 0.10228224 0.01300276
## [3,] 0.01627573 0.7062859 0.1513138 0.08453301 0.04159156
## [4,] 0.03413134 0.6708513 0.1668813 0.10887521 0.01926081
## [5,] 0.03889918 0.6916250 0.1675111 0.08921071 0.01275407
## [6,] 0.01619347 0.7141496 0.1710819 0.07837918 0.02019598
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(80),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
With pies
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
# #
ggsave(
here("output", "populations", "figures", "neuro-admixture_r_0.01_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4810226 256.9 11296654 603.4 NA 11296654 603.4
## Vcells 8980682 68.6 43400263 331.2 32768 68734537 524.5
Import Q
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "r_0.1","r_0.1_inference.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00224 0.827 0.0444 0.113 0.0130
## 2 0.000621 0.937 0.0307 0.0280 0.00339
## 3 0.00297 0.844 0.0415 0.103 0.00833
## 4 0.00247 0.784 0.0591 0.146 0.00809
## 5 0.00279 0.757 0.0703 0.159 0.0113
## 6 0.00192 0.845 0.0404 0.101 0.0120
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.0022390699 0.8273290 0.04438876 0.11306997 0.012973181
## 2 1002 OKI 0.0006207919 0.9372209 0.03072406 0.02804254 0.003391742
## 3 1003 OKI 0.0029661185 0.8442711 0.04149344 0.10293802 0.008331385
## 4 1004 OKI 0.0024678661 0.7841588 0.05913275 0.14614794 0.008092790
## 5 1005 OKI 0.0027905770 0.7570373 0.07026709 0.15857665 0.011328328
## 6 1006 OKI 0.0019242963 0.8448561 0.04041177 0.10083950 0.011968327
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.0022390699 0.8273290 0.04438876 0.11306997 0.012973181
## 2 1002 OKI 0.0006207919 0.9372209 0.03072406 0.02804254 0.003391742
## 3 1003 OKI 0.0029661185 0.8442711 0.04149344 0.10293802 0.008331385
## 4 1004 OKI 0.0024678661 0.7841588 0.05913275 0.14614794 0.008092790
## 5 1005 OKI 0.0027905770 0.7570373 0.07026709 0.15857665 0.011328328
## 6 1006 OKI 0.0019242963 0.8448561 0.04041177 0.10083950 0.011968327
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
Merge with pops
# Add an index column to Q_tibble
nadmixk5$index <- seq_len(nrow(nadmixk5))
# Perform the merge as before
df1 <-
merge(
nadmixk5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude
## 159 OKI 1001 0.0022390699 0.8273290 0.04438876 0.11306997 0.012973181 26.5013
## 160 OKI 1002 0.0006207919 0.9372209 0.03072406 0.02804254 0.003391742 26.5013
## 161 OKI 1003 0.0029661185 0.8442711 0.04149344 0.10293802 0.008331385 26.5013
## 162 OKI 1004 0.0024678661 0.7841588 0.05913275 0.14614794 0.008092790 26.5013
## 163 OKI 1005 0.0027905770 0.7570373 0.07026709 0.15857665 0.011328328 26.5013
## 164 OKI 1006 0.0019242963 0.8448561 0.04041177 0.10083950 0.011968327 26.5013
## Longitude Pop_City Country
## 159 127.9454 Okinawa Japan
## 160 127.9454 Okinawa Japan
## 161 127.9454 Okinawa Japan
## 162 127.9454 Okinawa Japan
## 163 127.9454 Okinawa Japan
## 164 127.9454 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#00FF00",
"v2" = "#0000FF",
"v3" = "#FFFF00",
"v4" = "#FF0000",
"v5" = "#FF00FF"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
Make sure the lat longs are in the correct order and arrangement
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4
## [1,] 0.0022390699 0.8273290 0.04438876 0.11306997
## [2,] 0.0006207919 0.9372209 0.03072406 0.02804254
## [3,] 0.0029661185 0.8442711 0.04149344 0.10293802
## [4,] 0.0024678661 0.7841588 0.05913275 0.14614794
## [5,] 0.0027905770 0.7570373 0.07026709 0.15857665
## [6,] 0.0019242963 0.8448561 0.04041177 0.10083950
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(80),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
color_palette <-
c(
"v1" = "#00FF00",
"v2" = "#0000FF",
"v3" = "#FFFF00",
"v4" = "#FF0000"#,
# "v5" = "#FF00FF"
)
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
We can easily plot this with the function layer_spatial from the ggspatial package:
With pies
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
# #
ggsave(
here("output", "populations", "figures", "neuro-admixture_r_0.1_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()
Clear memory and environment
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4810491 257.0 11296654 603.4 NA 11296654 603.4
## Vcells 8982410 68.6 41728252 318.4 32768 68734537 524.5
Import Q
# Extract ancestry coefficients
nadmixk5 <- read_delim(
here("output", "populations", "nadmix", "results", "neutral","neutral_inference.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(nadmixk5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.247 0.182 0.210 0.154 0.208
## 2 0.214 0.0904 0.306 0.208 0.182
## 3 0.257 0.179 0.191 0.161 0.211
## 4 0.373 0.109 0.117 0.257 0.144
## 5 0.356 0.110 0.138 0.255 0.141
## 6 0.324 0.162 0.168 0.165 0.181
The fam file
fam_file <- here(
"output", "populations", "snps_sets", "neutral.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.2465583 0.18223503 0.2096153 0.1539955 0.2075958
## 2 1002 OKI 0.2139250 0.09040003 0.3061219 0.2078033 0.1817497
## 3 1003 OKI 0.2573991 0.17856130 0.1912381 0.1614681 0.2113334
## 4 1004 OKI 0.3727467 0.10854463 0.1174154 0.2574833 0.1438100
## 5 1005 OKI 0.3558274 0.10966873 0.1382425 0.2551490 0.1411124
## 6 1006 OKI 0.3244200 0.16158162 0.1678030 0.1649402 0.1812551
Rename the columns
# Rename the columns starting from the third one
nadmixk5 <- nadmixk5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(nadmixk5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.2465583 0.18223503 0.2096153 0.1539955 0.2075958
## 2 1002 OKI 0.2139250 0.09040003 0.3061219 0.2078033 0.1817497
## 3 1003 OKI 0.2573991 0.17856130 0.1912381 0.1614681 0.2113334
## 4 1004 OKI 0.3727467 0.10854463 0.1174154 0.2574833 0.1438100
## 5 1005 OKI 0.3558274 0.10966873 0.1382425 0.2551490 0.1411124
## 6 1006 OKI 0.3244200 0.16158162 0.1678030 0.1649402 0.1812551
Import samples attributes
sampling_loc <- readRDS(here("output", "populations", "sampling_loc.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Region == "Asia"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country
)
head(pops)
## # A tibble: 6 × 5
## Abbreviation Latitude Longitude Pop_City Country
## <chr> <dbl> <dbl> <chr> <chr>
## 1 GEL 26.9 90.5 Gelephu Bhutan
## 2 CAM 11.6 105. Phnom Penh Cambodia
## 3 HAI 19.2 110. Hainan China
## 4 YUN 24.5 101. Yunnan China
## 5 HUN 27.6 112. Hunan China
## 6 BEN 13.0 77.6 Bengaluru India
Merge with pops
# Add an index column to Q_tibble
nadmixk5$index <- seq_len(nrow(nadmixk5))
# Perform the merge as before
df1 <-
merge(
nadmixk5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude
## 159 OKI 1001 0.2465583 0.18223503 0.2096153 0.1539955 0.2075958 26.5013
## 160 OKI 1002 0.2139250 0.09040003 0.3061219 0.2078033 0.1817497 26.5013
## 161 OKI 1003 0.2573991 0.17856130 0.1912381 0.1614681 0.2113334 26.5013
## 162 OKI 1004 0.3727467 0.10854463 0.1174154 0.2574833 0.1438100 26.5013
## 163 OKI 1005 0.3558274 0.10966873 0.1382425 0.2551490 0.1411124 26.5013
## 164 OKI 1006 0.3244200 0.16158162 0.1678030 0.1649402 0.1812551 26.5013
## Longitude Pop_City Country
## 159 127.9454 Okinawa Japan
## 160 127.9454 Okinawa Japan
## 161 127.9454 Okinawa Japan
## 162 127.9454 Okinawa Japan
## 163 127.9454 Okinawa Japan
## 164 127.9454 Okinawa Japan
We used this color palette to make the “structure” plot
color_palette2 <-
c(
"v1" = "#00FF00",
"v2" = "#FF00FF",
"v3" = "#FFFF00",
"v4" = "#0000FF",
"v5" = "#FF0000"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
df2 <- df1 |>
dplyr::rename(
Long = Longitude,
Lat = Latitude
)
long_lat_tibble <- df2 |>
dplyr::select(Long, Lat)
long_lat_matrix <- long_lat_tibble |>
as.matrix()
head(long_lat_matrix)
## Long Lat
## 159 127.9454 26.5013
## 160 127.9454 26.5013
## 161 127.9454 26.5013
## 162 127.9454 26.5013
## 163 127.9454 26.5013
## 164 127.9454 26.5013
Pull off the names of individuals and make a matrix of it:
## v1 v2 v3 v4 v5
## [1,] 0.2465583 0.18223503 0.2096153 0.1539955 0.2075958
## [2,] 0.2139250 0.09040003 0.3061219 0.2078033 0.1817497
## [3,] 0.2573991 0.17856130 0.1912381 0.1614681 0.2113334
## [4,] 0.3727467 0.10854463 0.1174154 0.2574833 0.1438100
## [5,] 0.3558274 0.10966873 0.1382425 0.2551490 0.1411124
## [6,] 0.3244200 0.16158162 0.1678030 0.1649402 0.1812551
## [1] TRUE
Create brick
genoscape_brick <- tess3r::tess3Q_map_rasters(
x = Q_matrix,
coord = long_lat_matrix,
map.polygon = selected_countries,
window = extent(selected_countries)[1:4],
# window = combined_extent,
resolution = c(600,600), # if you want more cells in your raster, set higher
# this next lines need to to be here, but don't do much...
col.palette = tess3r::CreatePalette(color_palette2, length(color_palette2)),
method = "map.max",
interpol = tess3r::FieldsKrigModel(80),
main = "Ancestry coefficients",
xlab = "Longitude",
ylab = "Latitude",
cex = .4
)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 0.02118404 (eff. df= 26.59999 )
genoscape_rgba <- genoscapeRtools::qprob_rando_raster(
TRB = genoscape_brick,
cols = color_palette2,
alpha_scale = 2.0,
abs_thresh = 0.0,
alpha_exp = 1.55,
alpha_chop_max = 255
)
# This adds the info for a regular lat-long projection
crs(genoscape_rgba) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
We can easily plot this with the function layer_spatial from the ggspatial package:
With pies
ggplot() +
layer_spatial(genoscape_rgba) +
geom_spatial_point(data = long_lat_tibble,
mapping = aes(x = Long, y = Lat),
size = .2) +
geom_text_repel(
data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines")
) +
labs(x = "Longitude",
y = "Latitude") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
my_theme() +
scale_fill_manual(values = color_palette2) +
guides(fill = "none") + # Hide legend
coord_sf()
## Assuming `crs = 4326` in stat_spatial_identity()
# #
ggsave(
here("output", "populations", "figures", "neuro-admixture_neutral_k5_interpolated_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## Assuming `crs = 4326` in stat_spatial_identity()