##### ##### Pipeline for calculating Fws Isolate Scores ##### #### Merging coverage files and SNP List. Localpath contains coverage files and snplist. Coverage files placed into list to be iterated over #### localpath<- covfiles<-list.files(localpath, pattern="PA*") snppositions<-read.table(paste(localpath,, sep=""), header=T) snppositions<-snppositions[,1:3] covoutpath<-paste(localpath,"isolatecoverage/",sep="") dir.create(covoutpath) for (i in 1:length(covfiles)) { currfile<-covfiles[i] currcov<-read.table(paste(localpath,currfile, sep=""), header=T) isoCov<-merge(snppositions, currcov, by.x=c("Chr","Pos","Ref"), by.y=c("Chromosome","Position","Reference"), sort=FALSE, all.x=TRUE) write.table(isoCov, paste(covoutpath, currfile, ".covout", sep=""),col.names=T, row.names=F) } #### Read in SNPList with masking information. List SNPs which need masking #### snppath<- Mask<-read.table(paste(snppath, "MaskedSNPsCol.txt",sep=""),header=TRUE) MaskRemoval<-which(Mask$Call=="N") #### .covout files are coverage files of only SNP positions #### covpath<- < .covout path > covfiles<-list.files(covpath, pattern="PA*") SNPCov<-read.table(paste(covpath,covfiles[1], sep=""), header=TRUE) SNPCov2<-SNPCov SNPCov<-SNPCov[,1:3] #### Produce dataframe with all annotation information, and read information for all four nucleotides of each isolate #### for(i in 1:length(covfiles)){ currisolatecov<-covfiles[i] shortisofile<-sub("-C.covout","",currisolatecov) A<-paste("A",shortisofile,sep="_") C<-paste("C", shortisofile,sep="_") G<-paste("G",shortisofile,sep="_") T<-paste("T",shortisofile,sep="_") isonames<-c(A,C,G,T) isolateXinfo<-read.table(paste(covpath,currisolatecov,sep=""),header=TRUE) names(isolateXinfo)[4:7]<-isonames SNPCov<-cbind(SNPCov,isolateXinfo[,4:7]) } ### Remove masked SNPs from SNP List ### SNPCov<-SNPCov[-MaskRemoval,] Info<-c("Chr","pos","ref") names(SNPCov)[1:3]<-Info isotmpinfo<-SNPCov ### Find and keep only the major and minor population allele ### ### Sum the number of reads at each SNP positions for each nucleotide ### isotmpinfo2<-isotmpinfo isotmpnewRef<-isotmpinfo[,1:3] alist3<-c(1:length(covfiles)) alist3<-alist3*4 clist3<-alist3+1 glist3<-clist3+1 tlist3<-glist3+1 isotmpnewRef$A<-apply(isotmpinfo[,alist3],1,sum) isotmpnewRef$C<-apply(isotmpinfo[,clist3],1,sum) isotmpnewRef$G<-apply(isotmpinfo[,glist3],1,sum) isotmpnewRef$T<-apply(isotmpinfo[,tlist3],1,sum) isotmpref<-isotmpinfo[,1:2] isotmpnon<-isotmpinfo[,1:2] ### Find the major population allele by giving the column name of the major nucleotide (i.e. A,C,G or T) with ###the greatest number of reads to build a major allele sequence across all SNPs ### RefSeq<-as.data.frame(cbind(row.names(isotmpnewRef[,4:7]),apply(isotmpnewRef[,4:7],1,function(x) names(isotmpnewRef[,4:7])[which(x==max(x))]))) isotmpref$RefSeq<-RefSeq[,2] ### Removal of SNPs which have equal read numbers for two alleles ### Check<-which(isotmpref$RefSeq!="A"&isotmpref$RefSeq!="C"&isotmpref$RefSeq!="G"&isotmpref$RefSeq!="T") isotmpref<-isotmpref[-Check,] isotmpnon<-isotmpnon[-Check,] isotmpinfo<-isotmpinfo[-Check,] isotmpnewRef<-isotmpnewRef[-Check,] ### Find the minor population allele by giving the column name of the minor nucleotide ( i.e. A,C,G or T) with the second greatest number of reads to build a minor allele sequence across all SNPs ### maxn<-function(n) function(x) order(x,decreasing=TRUE)[n] NonSeq<-as.data.frame(cbind(row.names(isotmpnewRef[,4:7]),apply(isotmpnewRef[,4:7],1,function(x) names(isotmpnewRef[,4:7])[which(x==x[maxn(2)(x)])]))) isotmpnon$NonSeq<-NonSeq[,2] ### Removal of SNPs which have equal read numbers for two alleles ### BiCheck<-which(isotmpnon$NonSeq!="A" & isotmpnon$NonSeq!="C" & isotmpnon$NonSeq!="G" & isotmpnon$NonSeq!="T") isotmpnon<-isotmpnon[-BiCheck,] isotmpref<-isotmpref[-BiCheck,] isotmpinfo<-isotmpinfo[-BiCheck,] ### Checkpoint ### isotmprefCP<-isotmpref isotmpnonCP<-isotmpnon isotmpinfoCP<-isotmpinfo isotmpinfo2<-isotmpinfo #### Extract number of reads per isolate for the Major and Minor allele at each SNP #### Avec<-c(1:(length(covfiles))*4) Cvec<-Avec+1 Gvec<-Avec+2 Tvec<-Avec+3 isotmpref[,4:(length(covfiles)+3)]<-0 isotmpnon[,4:(length(covfiles)+3)]<-0 ASNPs<-which(isotmpref$RefSeq=="A") CSNPs<-which(isotmpref$RefSeq=="C") GSNPs<-which(isotmpref$RefSeq=="G") TSNPs<-which(isotmpref$RefSeq=="T") isotmpref[ASNPs,4:(length(covfiles)+3)]<-isotmpinfo2[ASNPs,Avec] isotmpref[CSNPs,4:(length(covfiles)+3)]<-isotmpinfo2[CSNPs,Cvec] isotmpref[GSNPs,4:(length(covfiles)+3)]<-isotmpinfo2[GSNPs,Gvec] isotmpref[TSNPs,4:(length(covfiles)+3)]<-isotmpinfo2[TSNPs,Tvec] AaSNPs<-which(isotmpnon$NonSeq=="A") CcSNPs<-which(isotmpnon$NonSeq=="C") GgSNPs<-which(isotmpnon$NonSeq=="G") TtSNPs<-which(isotmpnon$NonSeq=="T") isotmpnon[AaSNPs,4:(length(covfiles)+3)]<-isotmpinfo2[AaSNPs,Avec] isotmpnon[CcSNPs,4:(length(covfiles)+3)]<-isotmpinfo2[CcSNPs,Cvec] isotmpnon[GgSNPs,4:(length(covfiles)+3)]<-isotmpinfo2[GgSNPs,Gvec] isotmpnon[TtSNPs,4:(length(covfiles)+3)]<-isotmpinfo2[TtSNPs,Tvec] ##### ##### Rare Allele Filtering. Removal of SNPs that do not have at least one of A) > 10 reads in 1 isolate, B) A population minor allele frequency of 1% ##### RareFilter<- apply(isotmpnon[,4:(length(covfiles)+3)], 1, function(x) max(x) < 10) RareFilter1<-which(RareFilter==TRUE) isotmpnon$Total<-apply(isotmpnon[,4:(length(covfiles)+3)],1, sum) isotmpref$Total<-apply(isotmpref[,4:(length(covfiles)+3)],1,sum) RareFilter2<-which(isotmpnon$Total<0.01*(isotmpref$Total+isotmpnon$Total)) RareFilterFinal<-RareFilter2[RareFilter2%in%RareFilter1] isotmpnon<-isotmpnon[-RareFilterFinal,] isotmpref<-isotmpref[-RareFilterFinal,] isotmpinfo2<-isotmpinfo2[-RareFilterFinal,] ##### ##### Removal of non-genic SNPs ##### snppath<- snppositions<-read.table(paste(snppath, ,sep=""),header=TRUE) ### Removal of SNPs from original SNP list already filtered out in pipeline ### snppositions<-snppositions[-MaskRemoval,] snppositions<-snppositions[-Check,] snppositions<-snppositions[-BiCheck,] snppositions<-snppositions[-RareFilterFinal,] snpanno<-snppositions[,"gene"] snpchrpos<-snppositions[,1:2] snpanno<-cbind(snpchrpos,snpanno) isotmpnon$snpanno<-snpanno$snpanno isotmpref$snpanno<-snpanno$snpanno ### Removal of non-coding ### NonCod<-which(snpanno$snpanno=="-") isotmpref<-isotmpref[-NonCod,] isotmpnon<-isotmpnon[-NonCod,] isotmpinfo2<-isotmpinfo2[-NonCod,] snpanno<-snpanno[-NonCod,] NonGenicRef<-isotmpref[NonCod,] NonGenicNon<-isotmpnon[NonCod,] isotmpinfo3<-isotmpinfo2[NonCod,] ##### ##### Coverage Filtering: Remove SNPs with a total population coverage outside an approximate Gaussian #### distribution (Median plus/minus 1SD) ##### TotalReads<-isotmpref$Total + isotmpnon$Total CutGen<-which(TotalReads<=(median(TotalReads)+sd(TotalReads)) & TotalReads>=(median(TotalReads)-sd(TotalReads))) isotmpnon<-isotmpnon[CutGen,] isotmpref<-isotmpref[CutGen,] isotmpinfo2<-isotmpinfo2[CutGen,] snpanno<-snpanno[CutGen,] ### Checkpoint ### isotmpnon2<-isotmpnon isotmpref2<-isotmpref isotmpinfo2orig<-isotmpinfo2 isotmpnon<-isotmpnon2 isotmpref<-isotmpref2 isotmpinfo2<-isotmpinfo2orig ##### ##### Add coverage cut offs per SNP and per isolate ##### isorefcounts<-isotmpref[,4:(length(covfiles)+3)] isononcounts<-isotmpnon[,4:(length(covfiles)+3)] ### All individual alleles with coverage <5 turned to 0 ### isorefcounts[isorefcounts<=4]=0 isononcounts[isononcounts<=4]=0 isorefcounts2<-isorefcounts isononcounts2<-isononcounts ### All positions within each isolate of coverage <20 classed as missing data ### isorefcounts[(isorefcounts2+isononcounts2)<20]=NA isononcounts[(isorefcounts2+isononcounts2)<20]=NA ##### ##### Removal of samples due to missingness; Samples with more than 20% of SNPs having missing data removed. ##### c.s=colSums(is.na(isorefcounts)) good.samples=which(c.s<0.2*(nrow(isorefcounts))) isorefcounts<-isorefcounts[,good.samples] isononcounts<-isononcounts[,good.samples] ##### ##### Removal of SNPs due to missingness; SNPs missing at >10% of samples removed ##### r.s = rowSums(is.na(isorefcounts[,1:length(good.samples)])); Miss.SNPs<-which(r.s>=0.1*(length(good.samples))) isorefcounts<-isorefcounts[-Miss.SNPs,] isononcounts<-isononcounts[-Miss.SNPs,] isotmpinfo2<-isotmpinfo2[-Miss.SNPs,] snpanno<-snpanno[-Miss.SNPs,] ##### ##### Make allele frequency (proportions) data frame for major and minor allele ##### isoMjfrac<-isorefcounts/(isorefcounts+isononcounts) isoMnfrac<-isononcounts/(isorefcounts+isononcounts) isoMjfrac$Popln<-apply(isoMjfrac,1,function(x) mean(x,na.rm=TRUE)) isoMnfrac$Popln<-apply(isoMnfrac,1,function(x) mean(x,na.rm=TRUE)) ### Removal of SNPs no longer SNPs following filtering AbsentSNPs<-which(isoMnfrac$Popln==0) isoMjfrac<-isoMjfrac[-AbsentSNPs,] isoMnfrac<-isoMnfrac[-AbsentSNPs,] isorefcounts<-isorefcounts[-AbsentSNPs,] isononcounts<-isononcounts[-AbsentSNPs,] isotmpinfo2<-isotmpinfo2[-AbsentSNPs,] snpanno<-snpanno[-AbsentSNPs,] ### SNPS which have changed from minor to major or vice versa following filtering are reversed ### SNPReversal<-which(isoMnfrac$Popln>0.5) isoMjfrac2<-isoMjfrac isoMjfrac[SNPReversal,]<-isoMnfrac[SNPReversal,] isoMnfrac[SNPReversal,]<-isoMjfrac2[SNPReversal,] ##### ##### Calculation of Hw and Hs for each SNP ##### Hwinfo<-2*isoMnfrac[,1:(length(good.samples))]*isoMjfrac[,1:(length(good.samples))] Hsinfo<-2*isoMnfrac$Popln*isoMjfrac$Popln Hsinfo<-as.data.frame(Hsinfo) Hwinfo<-as.data.frame(Hwinfo) ##### ##### Binning of SNPs into MAF intervals. SNPs are binned by population minor allele frequency (into 5% intervals). ##### cutMAFs2<-cut(isoMnfrac$Popln, seq(0,0.5,by=0.05)) Hwinfo2<-Hwinfo Hsinfo2<-Hsinfo Hwinfo2$Bin<-cutMAFs2 Hsinfo2$Bin<-cutMAFs2 ### Calculation of Hs across the population (so ONCE) for each interval. EntireHs<-tapply(Hsinfo2[,1], cutMAFs2, mean, na.rm=TRUE) EntireIntervals<-as.data.frame(EntireHs) EntireHwnames<-colnames(Hwinfo2[,1:(length(good.samples))]) ### Calculation of Hw for each isolate for each interval. for (i in 1:(length(good.samples))){ isoHwID<- EntireHwnames[i] EntireHw<- tapply(Hwinfo2[,isoHwID], cutMAFs2, mean, na.rm=TRUE) EntireHw<- as.data.frame(EntireHw) names(EntireHw)[1]<-isoHwID EntireIntervals<-cbind(EntireIntervals, EntireHw) } ### Fit a linear regression slope and take the coefficient to determine the slope of the line (Hw/Hs) for each isolate fit<-lm(EntireIntervals[,2]~ 0 + EntireIntervals[,1]) Entisoratio1<-as.vector(fit$coefficients[1]) for (i in 2:(length(good.samples))){ isoHwID<-EntireHwnames[i] fit<-lm(EntireIntervals[,i+1]~ 0 + EntireIntervals[,1]) Entisoratio<-as.vector(fit$coefficients[1]) Entisoratio1<-cbind(Entisoratio1,Entisoratio) } ### Get isolate IDs and calculate Fws as 1-Entisoratio1 scores. Entisoratio1 contains Hw/Hs. IDs<- names(isotmpinfo2[,Avec]) IDs<-sub("-C.coverage.covout","",IDs) IDs<-sub("A_","",IDs) IDs<-IDs[good.samples] PopulationFws<-1-Entisoratio1 names(PopulationFws)<-IDs plot(PopulationFws,xlab="Isolate ID",ylab="F"["ws"]~ "Score", ylim=c(0,1),xlim=c(0,(length(good.samples))) abline(h=0.95)