In this markdown I document the bash scripts I used on the cluster to align sequences to the reference genome, run quality control on the bam files, and select sites for a hard genotype call across 819 samples.

1. Convert bam files to fastq

I received bam files aligned to AalbF2 and needed to map the reads to the AalbF3 genome assembly, so I first converted the bams back to fastq.

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4             
#SBATCH --mem-per-cpu=20gb                 
#SBATCH --time=120:00:00                         
#SBATCH --array=1-819
#SBATCH --job-name=bam2fastq
#SBATCH -o picard.%A_%a.bam2fastq.o.txt
#SBATCH -e picard.%A_%a.bam2fastq.ERROR.txt

module load picard/2.9.0-Java-1.8.0_121
cd /ycga-gpfs/scratch60/caccone/lvc26/fastq

samplesheet="bams5.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

name=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $1}'`
r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'`

java -Xmx80g -jar $EBROOTPICARD/picard.jar SamToFastq VALIDATION_STRINGENCY=LENIENT \
I=$r1 \
FASTQ=$name\.R1.fastq.gz SECOND_END_FASTQ=$name\.R2.fastq.gz \
UNPAIRED_FASTQ=$name\.0.fastq.gz
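
Each array task reads its sample name and bam path from one line of the samplesheet. A minimal sketch of how such a sheet could be built, assuming two whitespace-separated columns (sample name, bam path; the bam directory is a placeholder):

# build a two-column samplesheet: sample name, bam path
for f in /path/to/bams/*.bam; do
    printf '%s\t%s\n' "$(basename "$f" .bam)" "$f"
done > bams5.txt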

2. Map sequences to reference genome using BWA MEM

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10            
#SBATCH --mem-per-cpu=4gb                 
#SBATCH --time=120:00:00                         
#SBATCH --array=1-819
#SBATCH --job-name=bwaMEM
#SBATCH -o bwa.%A_%a.bwa.o.txt
#SBATCH -e bwa.%A_%a.bwa.ERROR.txt
#SBATCH --requeue
 
module load BWA/0.7.17-foss-2016b
module load HTSlib/1.4.1-foss-2016a
module load SAMtools/1.9-foss-2016b

cd /gpfs/ycga/project/caccone/lvc26/september_2020/bams

samplesheet="fastq.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

name=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $1}'`
r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'` 
r2=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $3}'`

bwa mem -t $threads /gpfs/ycga/project/caccone/lvc26/september_2020/genome/aedes_albopictus_LA2_20200826.fasta \
$r1 $r2 | samtools view -@ $threads -bS - | samtools sort -m 1G -@ $threads -O BAM -o $name\.sorted.bam -
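
BWA needs the reference indexed before mapping. A one-time sketch, run once before submitting the array (the index files land next to the fasta):

module load BWA/0.7.17-foss-2016b
bwa index /gpfs/ycga/project/caccone/lvc26/september_2020/genome/aedes_albopictus_LA2_20200826.fasta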

3. Index bam files with samtools

I know, I could have piped this into the previous step, but I ran it separately afterwards.

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10            
#SBATCH --mem-per-cpu=4gb                 
#SBATCH --time=120:00:00                         
#SBATCH --array=1-819
#SBATCH --job-name=INDEX
#SBATCH -o samtools.%A_%a.samtools.o.txt
#SBATCH -e samtools.%A_%a.samtools.ERROR.txt
#SBATCH --requeue                 

module load SAMtools/1.9-foss-2016b

cd /gpfs/ycga/scratch60/caccone/lvc26/dedup

samplesheet="bams4.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'`

samtools index -@ $threads $r1
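
A quick sanity check, as a sketch, that every bam listed in the samplesheet got its index:

# report any bam from the samplesheet that is still missing a .bai file
while read -r name bam; do
    [ -e "${bam}.bai" ] || echo "missing index: $bam"
done < bams4.txt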

4. Mark PCR duplicates

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10             
#SBATCH --mem-per-cpu=9gb                 
#SBATCH --time=120:00:00                         
#SBATCH --array=1-819
#SBATCH --job-name=PICARD
#SBATCH -o picard.%A_%a.picard.o.txt
#SBATCH -e picard.%A_%a.picard.ERROR.txt
#SBATCH --requeue

pwd; hostname; date 

module load picard/2.9.0-Java-1.8.0_121

cd /gpfs/ycga/scratch60/caccone/lvc26/dedup

samplesheet="bams4.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

name=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $1}'`
r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'`

java -Xmx90G -jar $EBROOTPICARD/picard.jar MarkDuplicates \
I=$r1 O=$name\.dedup.bam M=$name\.dedup.txt VALIDATION_STRINGENCY=LENIENT
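
Each metrics file reports PERCENT_DUPLICATION in the row following the LIBRARY header line. A sketch, assuming the *.dedup.txt reports sit in the working directory, that tabulates the duplication rate per sample:

# pull PERCENT_DUPLICATION out of each Picard metrics report
for m in *.dedup.txt; do
    awk -F'\t' 'hit { print FILENAME "\t" $col; exit }
        $1 == "LIBRARY" { for (i = 1; i <= NF; i++) if ($i == "PERCENT_DUPLICATION") col = i; hit = 1 }' "$m"
done > dedup_rates.tsv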

5. Index the deduplicated bam files with samtools

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2            
#SBATCH --mem-per-cpu=20gb                 
#SBATCH --time=120:00:00                         
#SBATCH --array=1-819
#SBATCH --job-name=INDEX
#SBATCH -o samtools.%A_%a.samtools.o.txt
#SBATCH -e samtools.%A_%a.samtools.ERROR.txt
#SBATCH --requeue                 

module load SAMtools/1.9-foss-2016b

cd /gpfs/ycga/scratch60/caccone/lvc26/dedup

samplesheet="bams4.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'`

samtools index -@ $threads $r1

6. Create headers (add read groups)

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2           
#SBATCH --mem-per-cpu=5gb                 
#SBATCH --time=120:00:00                         
#SBATCH --array=1-819
#SBATCH --job-name=a.PICARD
#SBATCH -o picard.%A_%a.picard.o.txt
#SBATCH -e picard.%A_%a.picard.ERROR.txt
#SBATCH --requeue

pwd; hostname; date

module load picard/2.9.0-Java-1.8.0_121

cd /gpfs/ycga/scratch60/caccone/lvc26/header

samplesheet="bams4.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

name=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $1}'`
r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'`

java -Xmx9g -jar $EBROOTPICARD/picard.jar AddOrReplaceReadGroups \
I=$r1 \
O=$name\.header.bam \
RGSM=$name RGLB=wgs RGPL=illumina RGPU=none VALIDATION_STRINGENCY=LENIENT
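
To spot-check that the read group landed in the header, one can grep for the @RG line; some_sample.header.bam is a placeholder name:

module load SAMtools/1.9-foss-2016b
samtools view -H some_sample.header.bam | grep '^@RG'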

7. Create indel intervals for realignment

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1           
#SBATCH --mem-per-cpu=10gb                 
#SBATCH --time=120:00:00                         
#SBATCH --array=1-819
#SBATCH --job-name=gatk3_indel
#SBATCH -o gatk3.%A_%a.gatk.o.txt
#SBATCH -e gatk3.%A_%a.gatk.ERROR.txt

module load GATK/3.8-0-Java-1.8.0_121

cd /ycga-gpfs/scratch60/caccone/lvc26/indel

samplesheet="bams6.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

name=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $1}'` 
r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'` 

java -Xmx9g -jar $EBROOTGATK/GenomeAnalysisTK.jar \
-T RealignerTargetCreator \
-R /gpfs/ycga/project/caccone/lvc26/september_2020/genome/aedes_albopictus_LA2_20200826.fasta \
-I $r1 \
-o $name\.realigner.intervals
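
GATK3 expects a fasta index (.fai) and a sequence dictionary (.dict) alongside the reference. A one-time sketch, assuming neither exists yet:

module load SAMtools/1.9-foss-2016b
module load picard/2.9.0-Java-1.8.0_121
REF=/gpfs/ycga/project/caccone/lvc26/september_2020/genome/aedes_albopictus_LA2_20200826.fasta
samtools faidx $REF
java -jar $EBROOTPICARD/picard.jar CreateSequenceDictionary R=$REF O=${REF%.fasta}.dict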

8. Perform indel realignment

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2           
#SBATCH --mem-per-cpu=10gb                 
#SBATCH --time=120:00:00                         
#SBATCH --array=1-819
#SBATCH --job-name=GATK_IR
#SBATCH -o gatk3.IR.%A_%a.gatk.o.txt
#SBATCH -e gatk3.IR.%A_%a.gatk.ERROR.txt

module load GATK/3.8-1-0-gf15c1c3ef-Java-1.8

cd /ycga-gpfs/scratch60/caccone/lvc26/indel

samplesheet="bams.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

name=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $1}'` 
r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'` 
r2=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $3}'` 

java -Xmx18g -jar $EBROOTGATK/GenomeAnalysisTK.jar \
-T IndelRealigner \
-R /gpfs/ycga/project/caccone/lvc26/september_2020/genome/aedes_albopictus_LA2_20200826.fasta \
-I $r1 \
-targetIntervals $r2 \
-o $name\.realigned.bam
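
Before moving on to site selection, a quick integrity check on the realigned bams (a sketch; samtools quickcheck verifies the header and the EOF block of each file):

module load SAMtools/1.9-foss-2016b
for b in *.realigned.bam; do
    samtools quickcheck "$b" || echo "FAIL: $b"
done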

9. Run the SNPcleaner pipeline to find sites

I used the R script below together with the snpCleaner script to find sites with good sequence coverage. I ran it for each population, adjusting NUM_SAMS and MIN_IND according to the number of mosquitoes in each population.

Running it for each population creates four directories of site lists: “excl”, “filt”, “unfilt”, and “keeps”. Because the calls are divided into 10 Mb chunks, I concatenate the “keeps” sites in a later step.

#############################################################################
# Make a commands file to identify set of 'robust' sites for analysis with 
# ANGSD using samtools -> bcftools -> SNPcleaner
# 
# OUTPUT: .sh text file with one command per line.  Execute to produce sites
# files in chunks for use with ANGSD.  
#
# Execute commands using GNU parallel, for example with 16 jobs in parallel:
# parallel -j 16 -a commands.sh
# (note that the pbzip2 commands are multithreaded so each command uses >1 CPU) 
#
# Combine sites files in whatever group desired
# POP=Tampon
# cat ./keeps/$POP.pileup.FILT.*.chnk.*.sites | sort -k 2,2n > ./$POP.pileup.FILT.sites 
#
## Dependencies #######
# snpCleaner.pl
# samtools added to $PATH
# pbzip2 installed
# bcftools added to $PATH
# module load R/3.5.0-foss-2016b-avx2
# module load dSQ/0.96
#############################################################################

## Name of population for processing
POP='Name_of_population_here' 

## Today's date, to keep records if different runs/versions are required
date = 'December_5_2020'

#############################################################################
## Define file handles and make directories
# Path to text file with full path to bams, one per line
#BAM_FILES=paste('/path/to/bam/File.',POP,'.bams', sep = '')
BAM_FILES = paste('/ycga-gpfs/project/caccone/lvc26/september_2020/pops/bams.',
                  POP,
                  '.txt',
                  sep = '')

## Output commands file name
OUTPUT_FILE = paste(
  '/ycga-gpfs/project/caccone/lvc26/september_2020/sites/',
  POP,
  '.mpile.snpcln.',
  date,
  '.txt',
  sep = ''
)

## Sites file directory and prefix
DIRS = paste('/ycga-gpfs/project/caccone/lvc26/september_2020/sites/',
             POP,
             '/sites',
             sep = '')

## make subdirs
system(paste('mkdir -p -m 777 ', DIRS, sep = ''))
system(paste('mkdir -m 777 ', DIRS, '/excl', sep = '')) # for excluded sites
system(paste('mkdir -m 777 ', DIRS, '/filt', sep = ''))
system(paste('mkdir -m 777 ', DIRS, '/unfilt', sep = ''))
system(paste('mkdir -m 777 ', DIRS, '/keeps', sep = ''))

#############################################################################
## Set variables

REF='/ycga-gpfs/project/caccone/lvc26/september_2020/genome/aedes_albopictus_LA2_20200826.fasta'
FAI = read.table(paste(REF,'.fai',sep = ''), header = F) ## samtools fasta index
#SNPCLNR='/gpfs/ycga/project/caccone/lvc26/genome/snpCleaner.pl'
SNPCLNR='/ycga-gpfs/project/caccone/lvc26/september_2020/snpCleaner.pl'
## MIN_IND: Minimum number of individuals to include (specifically, only analyze sites with at least MIN_IND individuals having at least 2 reads)
## MAXD: And maximum depth combined across individuals per site
NUM_SAMS = as.numeric(nrow(read.table(BAM_FILES, header = F)))
if (NUM_SAMS <= 9) {
  MIN_IND = NUM_SAMS
  MAXD = MIN_IND * 3 * 15
} else if (NUM_SAMS > 9) {
  MIN_IND = 9
  MAXD = (NUM_SAMS * 0.8) * 3 * 15
}

# SNPcleaner thresholds
f = 1e-4 # min p-value for map quality bias
b = 1e-10 # min p-value for base quality bias
L = 1e-6 # min p-value for exact test of heterozygote deficit
H = 1e-6 # min p-value for exact test of heterozygote excess
a = 0 # minimum number of alternate alleles for site
u = 2 # minimum read depth for an individual (required by -k)

## Unchanged defaults in SNPcleaner
# S = 0.0001 # min p-value for strand bias
# Q = 10 # minimum RMS mapping quality for SNPS
# d = 2 # minium site read depth
# e = 0.0001 # min p-value for end distance bias 
# h = 0.0001 # min p-value for exact test of HWE
# F = 0 # inbreeding coefficient value
# v # process non-variant sites



#############################################################################
## Make commands file
# on Farnam  export PATH=$PATH:/home/lvc26/project/samtools/samtools-1.3.1
MOD = 'export PATH=$PATH:/home/lvc26/project/software/pbzip2-1.1.13; module load SAMtools/1.3.1-foss-2016b BCFtools/1.5-foss-2016b;'
## Start with a clean file
try(system(paste('rm ', OUTPUT_FILE, sep = '')), silent = TRUE)

## loop over scaffolds
for (i in 1:nrow(FAI)) {
  if (i %% 1000 == 0)
    cat(' --> Now on scaff ', i, '\n')
  
  
  ## Define
  chr = as.character(FAI[i, 1])
  
  
  ## Break up large scaffolds into 10MB chunks
  chnks = floor(as.numeric(as.character(FAI[i, 2])) / 10e6) + 1
  
  ## Loop over chunks
  for (c in 1:chnks) {
    ## Define start and stop of chunk
    end = c * 10e6
    strt = end - 10e6 + 1
    
    ## Construct full command
    cmmnd = paste(
      MOD,
      ' samtools mpileup -f ',
      REF,
      ' -b ',
      BAM_FILES,
      ' -q 10 -Q 20 -I -u -t SP,DP -r \'',
      chr,
      ':',
      as.integer(strt),
      '-',
      as.integer(end),
      '\' | bcftools call -f GQ -c - | pbzip2 -p3 -cz --best > ',
      DIRS,
      '/unfilt/',
      POP,
      '.pileup.unfilt.',
      chr,
      '.chnk.',
      c,
      '.vcf.bz2 && bzcat ',
      DIRS,
      '/unfilt/',
      POP,
      '.pileup.unfilt.',
      chr,
      '.chnk.',
      c,
      '.vcf.bz2 |perl ',
      SNPCLNR,
      ' -v -k ',
      MIN_IND,
      ' -u ',
      u,
      ' -D ',
      MAXD,
      ' -a ',
      a,
      ' -f ',
      f,
      ' -H ',
      H,
      ' -L ',
      L,
      ' -b ',
      b,
      ' -p ',
      DIRS,
      '/excl/',
      POP,
      '.pileup.EXCL.',
      chr,
      '.chnk.',
      c,
      '.vcf.bz2 | pbzip2 -p3 -cz --best > ',
      DIRS,
      '/filt/',
      POP,
      '.pileup.FILT.',
      chr,
      '.chnk.',
      c,
      '.vcf.bz2 && bzcat ',
      DIRS,
      '/filt/',
      POP,
      '.pileup.FILT.',
      chr,
      '.chnk.',
      c,
      '.vcf.bz2 | grep \'DP=\' |cut -f1,2 > ',
      DIRS,
      '/keeps/',
      POP,
      '.pileup.FILT.',
      chr,
      '.chnk.',
      c,
      '.sites',
      sep = ''
    )
    
    ## Append command (cmmnd) as new line on OUTPUT_FILE
    write.table(
      cmmnd,
      file = OUTPUT_FILE,
      append = T,
      quote = F,
      col.names = F,
      row.names = F
    )
    
  }
}

Running the R code above creates a commands file with 2198 lines (one per chunk). I used the dSQ (Dead Simple Queue) tool to submit it as an sbatch job array. Below are the first two lines for one population as an example; under the formula above, the -D 864 in these commands corresponds to a 24-sample population (24 × 0.8 × 3 × 15 = 864).

# This is only an example
samtools mpileup -f /home/lvc26/project/albopictus/genome/albo.genome.fasta -b /home/lvc26/project/albopictus/bams/bams.jp.txt -q 10 -Q 20 -I -u -t SP,DP -r 'NW_021837045.1:190000001-200000000' | bcftools call -f GQ -c - | pbzip2 -p3 -cz --best > /home/lvc26/project/albopictus/angsd/jp/sites/unfilt/jp.pileup.unfilt.NW_021837045.1.chnk.20.vcf.bz2 && bzcat /home/lvc26/project/albopictus/angsd/jp/sites/unfilt/jp.pileup.unfilt.NW_021837045.1.chnk.20.vcf.bz2 |perl /home/lvc26/project/albopictus/angsd/snpCleaner.pl -v -k 12 -u 2 -D 864 -a 0 -f 1e-04 -H 1e-06 -L 1e-06 -b 1e-10 -p /home/lvc26/project/albopictus/angsd/jp/sites/excl/jp.pileup.EXCL.NW_021837045.1.chnk.20.vcf.bz2 | pbzip2 -p3 -cz --best > /home/lvc26/project/albopictus/angsd/jp/sites/filt/jp.pileup.FILT.NW_021837045.1.chnk.20.vcf.bz2 && bzcat /home/lvc26/project/albopictus/angsd/jp/sites/filt/jp.pileup.FILT.NW_021837045.1.chnk.20.vcf.bz2 | grep 'DP=' |cut -f1,2 > /home/lvc26/project/albopictus/angsd/jp/sites/keeps/jp.pileup.FILT.NW_021837045.1.chnk.20.sites
samtools mpileup -f /home/lvc26/project/albopictus/genome/albo.genome.fasta -b /home/lvc26/project/albopictus/bams/bams.jp.txt -q 10 -Q 20 -I -u -t SP,DP -r 'NW_021837046.1:60000001-70000000' | bcftools call -f GQ -c - | pbzip2 -p3 -cz --best > /home/lvc26/project/albopictus/angsd/jp/sites/unfilt/jp.pileup.unfilt.NW_021837046.1.chnk.7.vcf.bz2 && bzcat /home/lvc26/project/albopictus/angsd/jp/sites/unfilt/jp.pileup.unfilt.NW_021837046.1.chnk.7.vcf.bz2 |perl /home/lvc26/project/albopictus/angsd/snpCleaner.pl -v -k 12 -u 2 -D 864 -a 0 -f 1e-04 -H 1e-06 -L 1e-06 -b 1e-10 -p /home/lvc26/project/albopictus/angsd/jp/sites/excl/jp.pileup.EXCL.NW_021837046.1.chnk.7.vcf.bz2 | pbzip2 -p3 -cz --best > /home/lvc26/project/albopictus/angsd/jp/sites/filt/jp.pileup.FILT.NW_021837046.1.chnk.7.vcf.bz2 && bzcat /home/lvc26/project/albopictus/angsd/jp/sites/filt/jp.pileup.FILT.NW_021837046.1.chnk.7.vcf.bz2 | grep 'DP=' |cut -f1,2 > /home/lvc26/project/albopictus/angsd/jp/sites/keeps/jp.pileup.FILT.NW_021837046.1.chnk.7.sites

Below is a dSQ batch file as an example

#!/bin/bash
#SBATCH --partition=scavenge
#SBATCH --array 0-2198
#SBATCH --output /dev/null
#SBATCH --job-name Foshan
#SBATCH --mem-per-cpu 6g -t 20:00:00 --mail-type ALL

# DO NOT EDIT LINE BELOW
/ycga-gpfs/apps/hpc/software/dSQ/0.96/dSQBatch.py /gpfs/ycga/project/caccone/lvc26/september_2020/sites/Foshan.mpile.snpcln.October_20_2020.txt /gpfs/ycga/project/caccone/lvc26/september_2020/sites

Below is the SNP cleaner script (snpCleaner.pl)

#!/usr/bin/perl

# SNPcleaner.pl
# Author: Tyler Linderoth, tylerp.linderoth@gmail.com
my $version = "2.4.1";

# TODO:
#    Add argument check
#    Output smaller BED file (join contiguous regions)

use strict;
use warnings;
use Getopt::Std;
use IO::Compress::Bzip2;

my %opts = ('?'=> 0, 'd'=>2, 'D'=>1000000, 'k'=>1, 'u'=>0, 'a'=>2, 'Q'=>10, 'S'=>1e-4, 'b'=>1e-100, 'f'=>0, 'e'=>1e-4, 'h'=>1e-4, 'F'=>0, 'H'=>0, 'L'=>0, 'A'=>undef,
        'M'=>undef, 'B'=>undef, 'p'=>undef, 'r'=>undef, 'X'=>undef, 't'=>undef, 'g'=>undef, 'v'=>undef, '2'=>undef, 'q'=>2);

getopts('?2d:D:k:u:a:Q:S:b:f:e:h:F:H:L:A:M:B:p:r:X:q:tgv', \%opts);

die (qq/
#####################
# SNPcleaner v$version #
#####################
This script works with the bcftools VCF file format to filter SNPs. This script is only for 
SNP filtering and will ignore INDELs. 

#######################################
Usage: snpCleaner.pl [options] <in.vcf>
or
cat <in.vcf> | snpCleaner.pl [options]

Options:
-?   help
-2   keep non-biallelic sites
-q INT   ploidy [$opts{q}]
-d INT   minimum site read depth [$opts{d}]
-D INT   maximum site read depth [$opts{D}]
-k INT   minimum number of 'covered' individuals (requires -u) [$opts{k}] 
-u INT   minimum read depth for an individual to be considered 'covered' (requires -k) [$opts{u}]
-a INT   minimum number of alternate alleles for site [$opts{a}]
-Q INT   minimum RMS mapping quality for SNPS [$opts{Q}]
-S FLOAT min p-value for strand bias [$opts{S}]
-b FLOAT min p-value for base quality bias [$opts{b}]
-f FLOAT min p-value for map quality bias [$opts{f}]
-e FLOAT min p-value for end distance bias [$opts{e}]
-h FLOAT min p-value for exact test of HWE [$opts{h}]
-F FLOAT inbreeding coefficient value [$opts{F}]
-H FLOAT min p-value for exact test of heterozygote excess [$opts{H}]
-L FLOAT min p-value for exact test of heterozygote deficit [$opts{L}]
-A FILE  ANCESTRAL fasta file (with FAI on same folder)
-M CHAR  mutation type(s) to remove (ex. 'CT_GA') (requires -A)
-B CHAR  name of BED file for kept SNP positions
-p CHAR  name of file to dump sites that failed filters (bziped)
-r FILE  list of contigs to exclude
-X FILE  BED file of exonic regions (sorted from lowest to highest contig)
-t       filter non-exonic sites (requires -X)
-g   filter exons with SNPs out of HWE (requires -X)
-v   process nonvariants

Generating input VCF with BCFtools:
-run 'bcftools mpileup' with '-a SP,DP' to provide depth and strand bias info
-run 'bcftools call' using '-f GQ -c'

An example for how to generate the input VCF file:
bcftools mpileup -f <reference fasta> -b <bam list> -a SP,DP | bcftools call -f GQ -c - > unfiltered_sites.vcf 

Consider using 'bcftools call --skip-variants indels' since these sites will
be ignored by snpCleaner anyways.

Output Notes:
-bed format file is zero-based.
-Characters preceding filtered sites (dumped with option -p) indicate filters that the sites
 failed to pass and correspond to the option flags (e.g. 'S' indicates strand bias).
\n/) if ($opts{'?'} || (!$ARGV[0] && -t STDIN));

# Argument check
if($opts{t} || $opts{g}) {
    die(qq/option -X (exonic region BED file) is required for options -t and -g\n/) unless ($opts{X});
}
if($opts{M}) {
    die(qq/option -A (ANCESTRAL fasta file) is required for option -M\n/) unless ($opts{A});
}
if($opts{k} !~ m/^\d+$/g || $opts{u} !~ m/^\d+$/g) {
    die(qq/option -k and -u are inter-dependent (min number of -k INT individuals with less than -u INT coverage)\n/);
}

if ($opts{q} < 1) {
    die(qq/Invalid ploidy option -q: $opts{q} ... ploidy should be >= 1\n/);
}
else
{
    if ($opts{q} != 2 && ($opts{h} + $opts{H} + $opts{L}) > 0) {
        print STDERR "Warning: Hardy-Weinberg filtering only applicable for diploid data (-q 2)\n";
    }
}

my ($excont, @t, @seq, @buffer, %exons, %ancestral);
my ($n_sites, $ind_dp, $prev_format, @format, @ind_depth) = (0,0,"");

#open some necessary filehandles
open(BED, '>', $opts{B}) or die("ERROR: could not create BED file: $!") if($opts{B});
my $bz2 = new IO::Compress::Bzip2($opts{p}, 'Append' => 0) if($opts{p});


# Read file with list of excluded contigs
if($opts{r}) {
    open(RMV, '<', $opts{r}) or die("ERROR: could not open excluded CONTIGS file: ");
    $excont = join("\t", <RMV>);
    close(RMV);
}

# Read FASTA index file
if($opts{A}) {
    open(FASTA, '<', $opts{A}) or die("ERROR: could not open FASTA file: ");
    binmode(FASTA);
    
    open(FAI, '<', $opts{A}.".fai") or die("ERROR: could not open FAI file: ");
    while(<FAI>){
    chomp;
    my @fai = split(/\t/);
    $ancestral{$fai[0]}{'length'} = $fai[1];
    $ancestral{$fai[0]}{'start'} = $fai[2];
    $ancestral{$fai[0]}{'n_chars'} = $fai[3];
    $ancestral{$fai[0]}{'n_bytes'} = $fai[4];
    }
    close(FAI);
}

# Read exon/intron information file
if($opts{X}) {
    open(EXON, '<', $opts{X}) or die("ERROR: could not open EXON file: ");
    while (<EXON>) {
    my @exon = split(/\s+/);
    push( @{$exons{$exon[0]}}, {'start' => $exon[1], 'end' => $exon[2], 'HWE' => 0} );
    }
    close(EXON);
}


my ($prev_contig, $cur_contig, $pos) = ('start','start',0);
 = "\t"; #for formatting printed output

# check if input VCF is from a file or STDIN
if (-t STDIN)
{
    die ("No input VCF\n") unless @ARGV;
}

# The core loop
while (<>) {
    my $violate = ''; # for flagging filter violations

    @t = split;

    # Skip (and print) header lines
    (print, next) if(m/^#/);

    # Update position
    $pos = $t[1];

    # If contig changed
    if($t[0] ne $cur_contig){
    $prev_contig = $cur_contig;
    $cur_contig = $t[0];
    }

    # Skip sites with unknown ref
    $violate .= 'N' if ($t[3] eq 'N');

    # Skip non-variable sites
    $violate .= 'v' if ($t[4] eq '.' && !$opts{v});

    # Skip Indels
    if (length($t[3]) > 1 || length($t[4]) > 1) {
    $violate .= 'I';
    next;
    }

    # Skip sites from excluded contigs
    $violate .= 'r' if ($opts{r} && $excont =~ /\b$cur_contig\b/);

    # Skip non-biallelic Sites

    if( scalar(split(/,/,$t[4])) > 1 && !$opts{2} ) {
    $violate .= '2';
    }

    # Read ANC base from FASTA
    my $anc_base;
    if($opts{A}) {
    my $n_lines = int($pos / $ancestral{$cur_contig}{'n_chars'} - 1e-6);
    my $extra_bytes_per_line = $ancestral{$cur_contig}{'n_bytes'} - $ancestral{$cur_contig}{'n_chars'};
    seek(FASTA, $ancestral{$cur_contig}{'start'} + $pos - 1 + $n_lines*$extra_bytes_per_line, 0);
    read(FASTA, $anc_base, 1);
    $anc_base = 'N' if($anc_base =~ m/[RYSWKMBDHV]/i);
    warn("WARNING: invalid ancestral base at ",$cur_contig,", pos ",$pos,": ",$anc_base,".\n") if($anc_base !~ m/[ACTGN]/i);

    # Skip non-biallelic sites (major and minor differ from ANCESTRAL)
    $violate .= "m($anc_base)" if($anc_base !~ m/\Q$t[3]\E|\Q$t[4]\E|N/i && $t[4] ne '.');
    }

    # Skip sites with specified mutation types
    if ($opts{M} && $t[3] =~ m/[ATCG]/i && $t[4] =~ m/[ATCG]/i) { # if site is variable
    my $refalt=$t[3].$t[4];
    my $altref=$t[4].$t[3];

    if ($opts{M} =~ /($refalt|$altref)/i) {
        my @mutype = split("", $1);
        $violate .= "M" if ($anc_base !~ m/$mutype[1]/i);
    }   
    }
    
    # Eveness across individuals coverage filter
    # check where in vcf FORMAT the DP ID is    
    if( $prev_format ne $t[8] ){
    $ind_dp = 0;
    $prev_format = $t[8];
    @format = split(":",$t[8]);
    foreach (@format) {
        $ind_dp++ if($_ ne 'DP');
        last if($_ eq 'DP');
    }
    }

    # count how many individuals have coverage >= $opts{u}
    my $covcount = 0;
    my @genoinfo = @t[9 .. $#t];
    if( $ind_dp <= $#format ) { # if DP is missing in vcf skip even coverage filter
    for(my $i=0; $i <= $#genoinfo; $i++) {
        my @ind_info = split(":", $genoinfo[$i]);
        $covcount++ if ($ind_info[$ind_dp] >= $opts{u});
        $ind_depth[$i] += $ind_info[$ind_dp];
    }
    $violate .= 'k' if ($covcount < $opts{k});  
    }else {
    warn("WARNING: no individual depth information at ",$cur_contig,", pos ",$pos,". Check for \"samtools mpileup -D\" option.");
    }

    # Site coverage 
    my ($dp, $dp_alt) = (0,0);
    if ($t[7] =~ m/DP4=(\d+),(\d+),(\d+),(\d+)/i) {
    $dp = $1 + $2 + $3 + $4;
    $dp_alt = $3 + $4;
    }
    $violate .= 'd' if ($dp < $opts{d});
    $violate .= 'D' if ($dp > $opts{D});
    $violate .= 'a' if ($dp > 0 && $dp_alt < $opts{a});

    # Root-mean-square mapping quality of covering reads
    my $mq = $1 if ($t[7] =~ m/MQ=(\d+)/i);
    $violate .= 'Q' if ($mq && $mq < $opts{Q});

    # Strand, baseQ, mapQ, and tail distance bias
    my ($strand, $baseqb, $mapqb, $tail_dist);
    if ($t[7] =~ m/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/) {
    $strand = $1;
    $baseqb = $2;
    $mapqb = $3;
    $tail_dist = $4;
    }
    $violate .= 'S' if ($strand && $strand < $opts{S});
    $violate .= 'b' if ($baseqb && $baseqb < $opts{b});
    $violate .= 'f' if ($mapqb && $mapqb < $opts{f});
    $violate .= 'e' if ($tail_dist && $tail_dist < $opts{e});

    # Identify non-exonic regions
    my $exon_id = -1;
    if ($opts{X})  {
    my $n_exons = scalar(@{$exons{$cur_contig}});
    for($exon_id = 0; $exon_id < $n_exons; $exon_id++) {
        last if ($pos >= $exons{$cur_contig}[$exon_id]{'start'} && 
             $pos <= $exons{$cur_contig}[$exon_id]{'end'});
    }
    $exon_id = -1 if($exon_id >= $n_exons);
    $violate .= 't' if ( $opts{t} && $exon_id < 0 );
    }
    
    # HWE exact test
    if ($opts{q} == 2 && $t[4] ne '.') {
    my %genocount = (homoa => 0, homob => 0, het => 0);
    foreach (@genoinfo) {
        if (/0\/0:/) {
        $genocount{homoa}++;
        } elsif (/1\/1:/) {
        $genocount{homob}++;
        } elsif (/0\/1:|1\/0:/) {
        $genocount{het}++;
        }
    }

    my ($pHWE, $pHI, $pLOW) = hwe_exact($genocount{het},$genocount{homoa},$genocount{homob},$opts{F});
    die(qq/Genotype counts less than 0\n/) if $pHWE == -1;
    if ($pHWE < $opts{h}) {
        $violate .= "h(p=$pHWE)";
        $exons{$cur_contig}[$exon_id]{'HWE'} = 1 if($exon_id > -1);
    }
    if ($pHI < $opts{H}) {
        $violate .= "H(p=$pHI)";
        $exons{$cur_contig}[$exon_id]{'HWE'} = 1 if($exon_id > -1);
    }
    if ($pLOW < $opts{L}) {
        $violate .= "L(p=$pLOW)";
        $exons{$cur_contig}[$exon_id]{'HWE'} = 1 if($exon_id > -1);
    }
    }

    # remove exons with SNPs out of HWE
    unless( $#buffer < 0 || 
        ($exon_id >= 0 &&
         $cur_contig eq $buffer[0]{'contig'} && 
         $exon_id == $buffer[0]{'exon_id'}) ) {

    print_buffer($opts{g}, \@buffer, \%exons);
    undef @buffer;
    }
    
    push( @buffer, {'contig' => $cur_contig, 'exon_id' => $exon_id, 'pos' => $pos, 'violate' => $violate, 'vcf' => join("\t",@t)} );
    $n_sites++;
}
# Force printing of last entry
print_buffer($opts{g}, \@buffer, \%exons);

print(STDERR $n_sites." sites processed!\n");
# Print per-individual depth
for(my $i=0; $i <= $#ind_depth; $i++) {
    print(STDERR "Ind ".($i+1)." depth:\t".($ind_depth[$i]/$n_sites)."\n");
}


close FASTA if($opts{A});
close BED if($opts{B});
$bz2->close if($opts{p});


exit(0);


#################
### Functions ###
#################
sub print_buffer {
    my ($opts_g, $buffer, $exons) = @_;

    foreach my $g (@$buffer) {
    $g->{'violate'} .= 'g' if($opts_g && $g->{'exon_id'} >= 0 && $exons->{$g->{'contig'}}[$g->{'exon_id'}]{'HWE'} == 1);
    
    if( !$g->{'violate'} ){
        print($g->{'vcf'}."\n");
        print(BED $g->{'contig'}."\t".($g->{'pos'}-1)."\t".$g->{'pos'}."\n") if $opts{B};
    }else{
        $bz2->print($g->{'violate'}."\t".$g->{'vcf'}."\n") if($opts{p});
    }
    }
}



sub hwe_exact {

# Citation:
# Implements an exact SNP test of Hardy-Weinberg Equilibrium as described in Wigginton et al. 2005
# note that probabilities are calculated from the midpoint in order to take advantage of the recurrence
# relationships recognized in Guo and Thompson (1992) in the implementation of their MCMC sampler

    my ($obs_hets, $obs_homa, $obs_homb, $F) = @_;

    return(-1) if ($obs_hets < 0 || $obs_homa < 0 || $obs_homb < 0);

    my $obs_homr; #rare homozygote
    my $obs_homc; #common homozygote
    my $n = $obs_homa + $obs_homb + $obs_hets; # total number genotypes

    # define common and rare homozygotes
    if ($obs_homa > $obs_homb) {
    $obs_homc = $obs_homa;
    $obs_homr = $obs_homb;
    } elsif ($obs_homa < $obs_homb) {
    $obs_homc = $obs_homb;
    $obs_homr = $obs_homa;
    } elsif ($obs_homa == $obs_homb) { # need to check how matching number homos affects algorithm
    $obs_homc = $obs_homa;
    $obs_homr = $obs_homb;
    }

    my $rare = 2 * $obs_homr + $obs_hets; # number of minor alleles

    # theta for inbreeding HWE calculations
    my $pc = 1 - $rare/(2*$n);
    my $pr = 1 - $pc;
    my $pCC = $pc**2 + $pc*$pr*$F;
    my $pCR = 2*$pc*$pr - 2*$pc*$pr*$F;
    my $pRR = $pr**2 + $pc*$pr*$F;
    $pRR = 1e-6 if($pRR == 0);
    my $theta = ($pCR**2)/($pCC*$pRR);
    $theta = 1e-6 if($theta == 0);

    # initialize heterozygote probability array
    my @probs;
    for (my $i = 0; $i <= $rare; $i++) {
    $probs[$i] = 0.0;
    }

    # find midpoint of the minor allele count distribution
    my $mid = int($rare * (2 * $n - $rare) / (2 * $n));
    $mid = $mid + 1 if ( ($mid % 2) != ($rare % 2) ); # ensures number minor alleles and midpoint have parity

    my $curr_hets = $mid;
    my $curr_homr = ($rare - $mid) / 2;
    my $curr_homc = $n - $curr_hets - $curr_homr;

    $probs[$mid] = 1.0;
    my $sum = $probs[$mid];

    # calculate probabilities from midpoint down 
    for ($curr_hets = $mid; $curr_hets > 1; $curr_hets -= 2) {
    $probs[$curr_hets - 2] = $probs[$curr_hets] * $curr_hets * ($curr_hets - 1) / 
        ($theta * ($curr_homr + 1) * ($curr_homc + 1));
    $sum += $probs[$curr_hets - 2];

    # 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote
    $curr_homr++;
    $curr_homc++;
    }

    # calculate  probabilities from midpoint up
    $curr_hets = $mid;
    $curr_homr = ($rare - $mid) / 2;
    $curr_homc = $n - $curr_hets - $curr_homr;

    for ($curr_hets = $mid; $curr_hets <= $rare - 2; $curr_hets += 2) {
    $probs[$curr_hets + 2] = $probs[$curr_hets] * $theta * $curr_homr * $curr_homc /
        (($curr_hets + 2) * ($curr_hets + 1));
    $sum += $probs[$curr_hets + 2];
    
    # add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote
    $curr_homr--;
    $curr_homc--;
    }
    for (my $i = 0; $i <= $rare; $i++) {
    $probs[$i] /= $sum;
    }

    # p-value calculation for hwe
    my $p_hwe = 0.0;
    for (my $i = 0; $i <= $rare; $i++) {
    next if ($probs[$i] > $probs[$obs_hets]);
    $p_hwe += $probs[$i];
    }
    $p_hwe = 1.0 if ($p_hwe > 1);

    # alternate p-value calculation for p_hi/p_low heterozygous
    my $p_hi = $probs[$obs_hets];
    for (my $i = $obs_hets + 1; $i <= $rare; $i++) {
    $p_hi += $probs[$i];
    }
    my $p_low = $probs[$obs_hets];
    for (my $i = $obs_hets - 1; $i >= 0; $i--) {
    $p_low += $probs[$i];
    }
#    my $p_hi_low;
#    if ($p_hi < $p_low) {
#   $p_hi_low = 2 * $p_hi;
#    } else {
#   $p_hi_low = 2 * $p_low
#    }

    return($p_hwe, $p_hi, $p_low);
}

How I submitted the dSQ jobs

# for dsq
module load dSQ/0.96

##############################################
                 Beijing_China
##############################################

dsq --job-file Beijing_China.mpile.snpcln.October_12_2020.txt --output /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name Beijing_China 
dsqa Beijing_China.mpile.snpcln.October_12_2020.txt job_10367110_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan
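
The same submit-then-audit pattern repeats for every population below. A hypothetical loop that would submit all of the per-population command files in one go (same dsq flags as above):

for jobfile in *.mpile.snpcln.*.txt; do
    pop=${jobfile%%.mpile*}
    dsq --job-file "$jobfile" --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name "$pop"
done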


##############################################
                 Guangzhou_China
##############################################

dsq --job-file Guangzhou_China.mpile.snpcln.November_16_2020.txt --output /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name Guangzhou_China 
dsqa Guangzhou_China.mpile.snpcln.November_16_2020.txt job_10500031_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan

##############################################
                 Hainan_China
##############################################

dsq --job-file Hainan_China.mpile.snpcln.November_16_2020.txt --output /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name Hainan_China 
dsqa Hainan_China.mpile.snpcln.November_16_2020.txt job_10500332_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan


##############################################
                 Honiara_Solomon_Islands
##############################################

dsq --job-file Honiara_Solomon_Islands.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Honiara_Solomon_Islands 
dsqa Honiara_Solomon_Islands.mpile.snpcln.October_13_2020.txt job_10368448_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Houston18_USA
##############################################


dsq --job-file Houston18_USA.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Houston18_USA 
dsqa Houston18_USA.mpile.snpcln.October_13_2020.txt job_10369429_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Ipoh_Malaysia
##############################################


dsq --job-file Ipoh_Malaysia.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Ipoh_Malaysia 
dsqa Ipoh_Malaysia.mpile.snpcln.October_13_2020.txt job_10369819_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Jacksonville_Florida_USA
##############################################


dsq --job-file Jacksonville_Florida_USA.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Jacksonville_Florida_USA 
dsqa Jacksonville_Florida_USA.mpile.snpcln.October_13_2020.txt job_10369939_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Jakarta_Indonesia
##############################################


dsq --job-file Jakarta_Indonesia.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Jakarta_Indonesia 
dsqa Jakarta_Indonesia.mpile.snpcln.October_13_2020.txt job_10369961_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan




##############################################
                 Jardin_Panteon
##############################################


dsq --job-file Jardin_Panteon.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Jardin_Panteon 
dsqa Jardin_Panteon.mpile.snpcln.October_13_2020.txt job_10369987_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Kathmandu_Nepal
##############################################


dsq --job-file Kathmandu_Nepal.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Kathmandu_Nepal 
dsqa Kathmandu_Nepal.mpile.snpcln.October_13_2020.txt job_10370052_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Kin_Village_Okinawa_Japan
##############################################


dsq --job-file Kin_Village_Okinawa_Japan.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Kin_Village_Okinawa_Japan 
dsqa Kin_Village_Okinawa_Japan.mpile.snpcln.October_13_2020.txt job_10370084_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan


##############################################
                 Kuala_Lumpur
##############################################


dsq --job-file Kuala_Lumpur.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Kuala_Lumpur 
dsqa Kuala_Lumpur.mpile.snpcln.October_13_2020.txt job_10370117_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Limbe_Cameroon
##############################################


dsq --job-file Limbe_Cameroon.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Limbe_Cameroon 
dsqa Limbe_Cameroon.mpile.snpcln.October_13_2020.txt job_10370139_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan


##############################################
                 LosAngeles_USA
##############################################


dsq --job-file LosAngeles_USA.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name LosAngeles_USA 
dsqa LosAngeles_USA.mpile.snpcln.October_13_2020.txt job_10370160_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Maceio_Brazil
##############################################


dsq --job-file Maceio_Brazil.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Maceio_Brazil 
dsqa Maceio_Brazil.mpile.snpcln.October_13_2020.txt job_10370193_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan


##############################################
                 Manassas_VA_USA
##############################################


dsq --job-file Manassas_VA_USA.mpile.snpcln.October_13_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Manassas_VA_USA 
dsqa Manassas_VA_USA.mpile.snpcln.October_13_2020.txt job_10370223_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Montpelier_France
##############################################


dsq --job-file Montpelier_France.mpile.snpcln.October_14_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Montpelier_France 
dsqa Montpelier_France.mpile.snpcln.October_14_2020.txt job_10378126_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 New_Zion_South_Carolina_USA
##############################################


dsq --job-file New_Zion_South_Carolina_USA.mpile.snpcln.October_14_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name New_Zion_South_Carolina_USA 
dsqa New_Zion_South_Carolina_USA.mpile.snpcln.October_14_2020.txt job_10378447_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Oak_Hill_Florida_USA
##############################################


dsq --job-file Oak_Hill_Florida_USA.mpile.snpcln.October_14_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Oak_Hill_Florida_USA 
dsqa Oak_Hill_Florida_USA.mpile.snpcln.October_14_2020.txt job_10378491_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan


##############################################
                 Pahokee_FL_USA
##############################################


dsq --job-file Pahokee_FL_USA.mpile.snpcln.October_14_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Pahokee_FL_USA 
dsqa Pahokee_FL_USA.mpile.snpcln.October_14_2020.txt job_10378543_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Palm_Beach_Florida_USA
##############################################


dsq --job-file Palm_Beach_Florida_USA.mpile.snpcln.October_14_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Palm_Beach_Florida_USA 
dsqa Palm_Beach_Florida_USA.mpile.snpcln.October_14_2020.txt job_10378560_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan




##############################################
                 Papua_Indonesia
##############################################


dsq --job-file Papua_Indonesia.mpile.snpcln.October_14_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Papua_Indonesia 
dsqa Papua_Indonesia.mpile.snpcln.October_14_2020.txt job_10378614_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan


##############################################
                 Prto_Igu_Msns_Argentina
##############################################


dsq --job-file Prto_Igu_Msns_Argentina.mpile.snpcln.October_14_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Prto_Igu_Msns_Argentina 
dsqa Prto_Igu_Msns_Argentina.mpile.snpcln.October_14_2020.txt job_10378656_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan




##############################################
                 Salem_New_Jersey_USA
##############################################


dsq --job-file Salem_New_Jersey_USA.mpile.snpcln.October_14_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Salem_New_Jersey_USA 
dsqa Salem_New_Jersey_USA.mpile.snpcln.October_14_2020.txt job_10378690_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan





##############################################
                 Sendai_Japan
##############################################


dsq --job-file Sendai_Japan.mpile.snpcln.November_17_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Sendai_Japan 
dsqa Sendai_Japan.mpile.snpcln.November_17_2020.txt job_10503588_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan




##############################################
                 Shanghai_China
##############################################


dsq --job-file Shanghai_China.mpile.snpcln.October_16_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Shanghai_China 
dsqa Shanghai_China.mpile.snpcln.October_16_2020.txt job_10388110_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan




##############################################
                 StAugustine_Trinidad
##############################################


dsq --job-file StAugustine_Trinidad.mpile.snpcln.October_16_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name StAugustine_Trinidad 
dsqa StAugustine_Trinidad.mpile.snpcln.October_16_2020.txt job_10387744_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan




##############################################
                 Sumba_Indonesia
##############################################


dsq --job-file Sumba_Indonesia.mpile.snpcln.October_16_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Sumba_Indonesia 
dsqa Sumba_Indonesia.mpile.snpcln.October_16_2020.txt job_10388809_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan




##############################################
                 Tampon
##############################################


dsq --job-file Tampon.mpile.snpcln.October_16_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Tampon 
dsqa Tampon.mpile.snpcln.October_16_2020.txt job_10388978_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Tanegashima_Japan
##############################################


dsq --job-file Tanegashima_Japan.mpile.snpcln.October_16_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Tanegashima_Japan 
dsqa Tanegashima_Japan.mpile.snpcln.October_16_2020.txt job_10389175_status.tsv > failedjobs.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan




##############################################
                 Tokyo_Japan
##############################################


dsq --job-file Tokyo_Japan.mpile.snpcln.October_16_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Tokyo_Japan 
dsqa Tokyo_Japan.mpile.snpcln.October_16_2020.txt job_10389216_status.tsv > failedjobsTokyo_Japan.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Vero_Beach_FL_USA
##############################################


dsq --job-file Vero_Beach_FL_USA.mpile.snpcln.October_16_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Vero_Beach_FL_USA 
dsqa Vero_Beach_FL_USA.mpile.snpcln.October_16_2020.txt job_10389432_status.tsv > failedjobsVeroBeach.txt 2> report.txt
# dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan 
# dsqa failedjobs.txt job_9361930_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan


##############################################
                 Wajima_Japan
##############################################


dsq --job-file Wajima_Japan.mpile.snpcln.October_19_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Wajima_Japan 
dsqa Wajima_Japan.mpile.snpcln.October_19_2020.txt job_10398951_status.tsv > failedjobsWajima_Japan.txt 2> report.txt
#dsq --job-file failedjobsWajima_Japan.txt -o /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Wajima_Japan2 
#dsqa failedjobsWajima_Japan.txt job_10396375_status.tsv > failedjobs2.txt 2> report2.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan



##############################################
                 Yunnan_China
##############################################


dsq --job-file Yunnan_China.mpile.snpcln.October_19_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Yunnan_China 
dsqa Yunnan_China.mpile.snpcln.October_19_2020.txt job_10399257_status.tsv > failedjobsYunnan_China.txt 2> report.txt
#dsq --job-file failedjobs.txt -o /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Yunnan_China2 
#dsqa failedjobs.txt job_10396305_status.tsv > failedjobs2.txt 2> report.txt
# dsq --job-file failedjobs2.txt -o /dev/null --mem-per-cpu 4g -t 20:00:00 --mail-type ALL --job-name foshan


##############################################
                 Foshan
##############################################


dsq --job-file Foshan.mpile.snpcln.October_20_2020.txt --output /dev/null --mem-per-cpu 6g -t 20:00:00 --mail-type ALL --job-name Foshan 
dsqa Foshan.mpile.snpcln.October_20_2020.txt job_10406738_status.tsv > failedjobsFoshan.txt 2> report.txt
dsq --job-file failedjobsFoshan.txt -o /dev/null --mem-per-cpu 10g -t 20:00:00 --mail-type ALL --job-name foshan2 
dsqa failedjobsFoshan.txt job_10408871_status.tsv > failedjobsFoshan2.txt 2> report.txt
#dsq --job-file failedjobsFoshan2.txt -o /dev/null --mem-per-cpu 20g -t 120:00:00 --mail-type ALL --job-name foshan3
#dsqa failedjobsFoshan2.txt job_10404753_status.tsv > failedjobsFoshan2.txt 2> report.txt


##############################################
                 Torres_Strait_Islands
##############################################


dsq --job-file Torres_Strait_Islands.mpile.snpcln.October_20_2020.txt --output /dev/null --mem-per-cpu 10g -t 20:00:00 --mail-type ALL --job-name Torres_Strait_Islands 
dsqa Torres_Strait_Islands.mpile.snpcln.October_20_2020.txt job_10407048_status.tsv > failedjobsTorres_Strait_Islands.txt 2> report.txt
dsq --job-file failedjobsTorres_Strait_Islands.txt -o /dev/null --mem-per-cpu 10g -t 20:00:00 --mail-type ALL --job-name Torres_Strait_Islands2
dsqa failedjobsTorres_Strait_Islands.txt job_10409688_status.tsv > Torres_Strait_IslandsFailed2.txt 2> report.txt
#dsq --job-file Torres_Strait_IslandsFailed2.txt -o /dev/null --mem-per-cpu 20g -t 120:00:00 --mail-type ALL --job-name Torres_Strait_Islands
#dsqa Torres_Strait_IslandsFailed2.txt job_10404749_status.tsv > Torres_Strait_IslandsFailed3.txt 2> report.txt


##############################################
                 Cali_Colombia
##############################################


dsq --job-file Cali_Colombia.mpile.snpcln.December_1_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Cali_Colombia 
dsqa Cali_Colombia.mpile.snpcln.December_1_2020.txt job_10559838_status.tsv > failedjobs.txt 2> report.txt

##############################################
                 Gelephu_Bhutan
##############################################


dsq --job-file Gelephu_Bhutan.mpile.snpcln.December_1_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Gelephu_Bhutan 
dsqa Gelephu_Bhutan.mpile.snpcln.December_1_2020.txt job_10560143_status.tsv > failedjobs.txt 2> report.txt


##############################################
                 Jaffna_SriLanka
##############################################


dsq --job-file Jaffna_SriLanka.mpile.snpcln.December_1_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Jaffna_SriLanka 
dsqa Jaffna_SriLanka.mpile.snpcln.December_1_2020.txt job_10560263_status.tsv > failedjobs.txt 2> report.txt


##############################################
                 Springfield_MO_USA
##############################################


dsq --job-file Springfield_MO_USA.mpile.snpcln.December_1_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Springfield_MO_USA 
dsqa Springfield_MO_USA.mpile.snpcln.December_1_2020.txt job_10560285_status.tsv > failedjobs.txt 2> report.txt


##############################################
                 New_Orleans
##############################################


dsq --job-file New_Orleans.mpile.snpcln.December_1_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name New_Orleans 
dsqa New_Orleans.mpile.snpcln.December_1_2020.txt job_10560316_status.tsv > failedjobs.txt 2> report.txt


##############################################
                 Tainan_Taiwan
##############################################
dsq --job-file Tainan_Taiwan.mpile.snpcln.December_1_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Tainan_Taiwan 
dsqa Tainan_Taiwan.mpile.snpcln.December_1_2020.txt job_10560349_status.tsv > failedjobs.txt 2> report.txt


##############################################
                 Imperia_Italy
##############################################


dsq --job-file Imperia_Italy.mpile.snpcln.December_5_2020.txt --output /dev/null --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name Imperia_Italy 
dsqa Imperia_Italy.mpile.snpcln.December_5_2020.txt job_10575997_status.tsv > failedjobs.txt 2> report.txt
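
Each population went through the same submit, check, resubmit cycle, so the per-population commands above could be wrapped in a small loop. A minimal sketch, assuming the job files keep the population.mpile.snpcln.date.txt naming used above (the population list here is just illustrative):

#!/bin/bash
# submit one dsq array per population; job files follow the naming used above
for pop in Cali_Colombia Gelephu_Bhutan Jaffna_SriLanka; do
    jobfile=$(ls "$pop".mpile.snpcln.*.txt | head -n 1)
    dsq --job-file "$jobfile" --output /dev/null \
        --mem-per-cpu 5g -t 20:00:00 --mail-type ALL --job-name "$pop"
done

# after each array finishes, collect the failed tasks the same way as above:
# dsqa <job file> job_<jobid>_status.tsv > failedjobs<population>.txt 2> report.txt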

9. Get sites for genotype calls

In this step I concatenate the sites files from the snpCleaner pipeline. Below is the code used for one population.

# get an interactive session
# srun --pty -t 24:00:00 -c 4 --mem=10G -p interactive bash

# load Anaconda
# module load Anaconda2/4.2.0

# concatenate all *.sites files for one population into a single file
from path import Path

d = Path('/ycga-gpfs/project/caccone/lvc26/september_2020/sites/Tainan_Taiwan/sites/keeps/')
with open('/ycga-gpfs/project/caccone/lvc26/september_2020/sites/cat/new_pops/Tainan_Taiwan.txt', 'w') as f_out:
    for file_name in d.walk('*.sites'):  # every .sites file under keeps/
        with open(file_name) as f_in:
            f_out.write(f_in.read())
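
The same concatenation can also be done straight from the shell. A one-liner sketch under the same directory layout (find visits the files in a different order than walk, which does not matter here because the sites get sorted in the next step):

find /ycga-gpfs/project/caccone/lvc26/september_2020/sites/Tainan_Taiwan/sites/keeps/ \
    -name '*.sites' -exec cat {} + \
    > /ycga-gpfs/project/caccone/lvc26/september_2020/sites/cat/new_pops/Tainan_Taiwan.txt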

Next I sorted the sites files. I wrote a Python script for this because some of the files had over a billion lines.

Example of a batch script to sort the files:

#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1           
#SBATCH --mem-per-cpu=70gb       
#SBATCH --time=120:00:00                         
#SBATCH --job-name=sort
#SBATCH -o sort.%A_%a.sort.o.csv
#SBATCH -e sort.%A_%a.sort.ERROR.csv
#SBATCH --mail-type ALL

module load Anaconda2/4.2.0

#cd /ycga-gpfs/project/caccone/lvc26/september_2020/sites/cat/new_pops/
cd /gpfs/ycga/project/caccone/lvc26/september_2020/sites/cat/intersects/shared
python sort2.py

The Python script:

import os
import glob
import pandas as pd

files = glob.glob('/gpfs/ycga/project/caccone/lvc26/september_2020/sites/cat/intersects/shared/*.txt')
for file in files:
    df = pd.read_csv(file, sep='\t', skipinitialspace=True, header=None)  # read the two-column sites file
    df.sort_values(by=[0], inplace=True)  # sort by the first column (scaffold)
    df.to_csv(file + '_sorted.txt', index=None, header=None, sep='\t')  # write the sorted file
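
An alternative that avoids holding a billion-line file in RAM is GNU sort, which spills to temporary files on disk. A sketch that sorts by scaffold and then numerically by position (the -T scratch directory is an assumption):

# sort each sites file by scaffold, then numerically by position;
# -T points the temporary files at scratch instead of /tmp
for f in /gpfs/ycga/project/caccone/lvc26/september_2020/sites/cat/intersects/shared/*.txt; do
    LC_ALL=C sort -t $'\t' -k1,1 -k2,2n \
        -T /gpfs/ycga/scratch60/caccone/lvc26/tmp \
        "$f" > "$f"_sorted.txt
done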

The next step was to get the intersection of sites with good read coverage across all populations. I used a Python script to compute it. The sites files are just text files with two columns, the scaffold and the position; we use them later with angsd to do the genotype calls.
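
For reference, a sites file looks like this (tab-separated; the scaffold names and positions below are made-up placeholders):

scaffold_1	10234
scaffold_1	10250
scaffold_2	811

Example of the batch script I submitted; it needed a lot of memory because the files had around a billion lines: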

#!/bin/bash
#SBATCH --partition=scavenge
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1           
#SBATCH --mem-per-cpu=200gb       
#SBATCH --time=120:00:00                         
#SBATCH --job-name=intersect
#SBATCH -o intersection_12.%A_%a.intersection_12.o.log
#SBATCH -e intersection_12.%A_%a.intersection_12.ERROR.log
#SBATCH --mail-type ALL

module load Anaconda2/4.2.0

cd /gpfs/ycga/project/caccone/lvc26/september_2020/sites/cat

python intersection.py

The Python script:

import glob

files = glob.glob("*.txt")

# seed the running intersection with the sites from the first file
combine = set(open(files[0]))

print("Warning: assumes that there is a newline at the end of every file")
print("finding intersection of:")

for file in files:
    print(str(file))
    fp = open(file)
    combine = combine & set(fp)  # keep only the sites present in every file so far
    fp.close()

# write the shared sites to results.txt
with open('results.txt', 'w+') as f:
    for item in combine:
        f.write("%s" % item)

10. Genotype calls using the sites files

Once I had the sites with good read coverage across all populations, I indexed the sites file with angsd and did the genotype call, exporting in tped format.
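
The indexing step is not part of the batch script below; it is a one-off command run on the shared sites file, something like this (using the same angsd build as in the script):

cd /gpfs/ycga/project/caccone/lvc26/september_2020/sites/cat/intersects/shared

# index the sites file so angsd can restrict the genotype call to it with -sites;
# this writes shared_sites.txt.idx and shared_sites.txt.bin next to it
/home/lvc26/project/angsd/angsd sites index shared_sites.txt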

#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=20          
#SBATCH --mem-per-cpu=6gb                  
#SBATCH --time=120:00:00 
#SBATCH --array=1-819
#SBATCH --job-name=angsd_chr
#SBATCH -o angsd_chr.%A_%a.o.txt
#SBATCH -e angsd_chr.%A_%a.ERROR.txt

cd /gpfs/ycga/project/caccone/lvc26/september_2020/snp_calls/chunk_calls

samplesheet="scaffolds.txt"

threads=$SLURM_JOB_CPUS_PER_NODE

name=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $1}'`
####r1=`sed -n "$SLURM_ARRAY_TASK_ID"p $samplesheet |  awk '{print $2}'`

/home/lvc26/project/angsd/angsd \
-ref /gpfs/ycga/project/caccone/lvc26/september_2020/genome/aedes_albopictus_LA2_20200826.fasta \
-bam /gpfs/ycga/project/caccone/lvc26/september_2020/snp_calls/bams.txt \
-nThreads 40 \
-r $name \
-gl 1 \
-dopost 1 \
-doMaf 2 \
-doMajorMinor 4 \
-minMapQ 20 \
-minQ 10 \
-remove_bads 1 \
-uniqueOnly 1 \
-sites /gpfs/ycga/project/caccone/lvc26/september_2020/sites/cat/intersects/shared/shared_sites.txt \
-doCounts 1 \
-setMinDepthInd 10 \
-minInd 2 \
-SNP_pval 1e-6 \
-doPlink 2 \
-doGeno 4 \
-capDepth 45 \
-minMaf 0.01 \
-out $name
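
-doPlink 2 writes transposed text filesets (.tped/.tfam) per scaffold, while the merge list below points at binary filesets, so there is an implicit conversion step in between. That step is not shown in these notes; a plausible sketch of it:

module load PLINK/1.90-beta4.4
cd /gpfs/ycga/project/caccone/lvc26/september_2020/snp_calls/chunk_calls
mkdir -p plink

# convert each per-scaffold .tped/.tfam pair into a binary fileset (.bed/.bim/.fam)
for t in *.tped; do
    prefix="${t%.tped}"
    plink --tfile "$prefix" --allow-extra-chr --make-bed --out plink/"$prefix"
done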

Then I merged the per-scaffold files with PLINK:

#!/bin/sh
#SBATCH --partition=scavenge
#SBATCH --mail-type=BEGIN,END,FAIL          
#SBATCH --mail-user=luciano.cosme@yale.edu 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10          
#SBATCH --mem-per-cpu=100gb                  
#SBATCH --time=120:00:00 
#SBATCH --job-name=plink_merge
#SBATCH -o plink_merge.%A_%a.o.txt
#SBATCH -e plink_merge.%A_%a.ERROR.txt

module load PLINK/1.90-beta4.4

cd /gpfs/ycga/project/caccone/lvc26/september_2020/snp_calls/chunk_calls/plink
plink --merge-list bed_files.txt --allow-extra-chr --make-bed --out merged --memory 1000000
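
For reference, --merge-list takes a text file with one fileset per line; for binary filesets a single prefix per line is enough, so bed_files.txt can be generated straight from the converted files:

# one binary fileset prefix per line; PLINK appends .bed/.bim/.fam itself
ls *.bed | sed 's/\.bed$//' > bed_files.txt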