# A script to analyse the phytoplankton community of Mwanza Gulf, Lake Victoria

# Set path for packages
# NOTE(review): machine-specific library location; adjust when running elsewhere
.libPaths("C:/Software/R")

# Set working directory to the folder that contains this script.
# rstudioapi can only report the script path inside a running RStudio session
# with a saved file; in any other situation (e.g. Rscript, unsaved buffer) the
# current working directory is kept and the input CSV files must be there.
script_path <- NULL
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getSourceEditorContext()$path
}
if (length(script_path) == 1 && !is.na(script_path) && nzchar(script_path)) {
  setwd(dirname(script_path))
} else {
  message("Script location unknown; keeping working directory: ", getwd())
}

# Empty the environment (including objects whose names start with a dot)
rm(list = ls(all.names = TRUE))  # remove all objects

library(tidyverse)

#### 1. Phytoplankton composition calculations ----

# Taxonomic information for the Lake Victoria phytoplankton taxa
VictPhytoTaxonomy <- read.csv("VictPhytoTaxonomy.csv", header = TRUE)
str(VictPhytoTaxonomy)

# Raw phytoplankton records for Lake Victoria
VictPhytoRaw <- read.csv("VictPhytoRaw.csv", header = TRUE)
str(VictPhytoRaw)

# Inputs for calculating densities per species, season and station
# Correction factors (joined later on Station, Season and Subsamp)
PhytoVictoriaFactors <- read.csv("PhytoVictoriaFactors.csv", header = TRUE)
str(PhytoVictoriaFactors)

# Constants used to scale corrected counts to densities
dropvol <- 0.05336  # Volume of one sample drop (mL)
concentr <- 20      # Concentration factor from lake water to sample
samplesurf <- 400   # Surface area of counted sample (mm^2)

# Density per taxon (ID_NR), season and station:
#   1. count the raw records per taxon and subsample,
#   2. divide each count by Surface_mm2 from the correction factors
#      (presumably the area counted for that subsample - confirm),
#   3. sum over subsamples and scale with samplesurf, dropvol and concentr.
# summarise() drops only the last grouping level, so the result stays grouped
# by ID_NR and Season.
VictPhytoDens1 <- VictPhytoRaw %>%
  count(ID_NR, Season, Station, Subsamp) %>%
  full_join(PhytoVictoriaFactors, by = c("Station", "Season", "Subsamp")) %>%
  mutate(nCor = n / Surface_mm2) %>%
  group_by(ID_NR, Season, Station) %>%
  summarise(Density = round(sum(nCor) * samplesurf / dropvol / concentr)) %>%
  # keep only taxa present in the taxonomy table and attach the species name
  inner_join(select(VictPhytoTaxonomy, ID_NR, Species), by = "ID_NR") %>%
  arrange(Season, Station, ID_NR)

# Create full table with densities: one row per taxon and one column per
# Season_Station sample; combinations without a density record are set to 0
PhytoComp1 <- VictPhytoDens1 %>%
  pivot_wider(
    names_from = c(Season, Station),
    values_from = Density,
    values_fill = 0
  ) %>%
  # join key stated explicitly (ID_NR is the only shared column) so the match
  # cannot silently change when columns are added to either table
  left_join(select(VictPhytoTaxonomy, ID_NR, Phylum), by = "ID_NR") %>%
  arrange(Phylum, Species)
# write.csv(PhytoComp1, file="QuantitativeTable1.csv")

#### 2. Producing tables for the manuscript ----

# Table 1: presence-absence table of phytoplankton taxa

# Mark every taxon/season/station record in the density table with "+"
PhytoComp2 <- mutate(VictPhytoDens1, Presence = "+")
# Create presence-absence table: one column per Season_Station sample,
# combinations without a record are shown as "-"
PhytoComp3 <- PhytoComp2 %>%
  select(-Density) %>%
  pivot_wider(
    names_from = c(Season, Station),
    values_from = Presence,
    values_fill = "-"
  ) %>%
  arrange(ID_NR)
# write.csv(PhytoComp3, file="Table1.csv")

# Table 2: relative densities of the most abundant genera
# Overall cumulative percentages (including unidentified species!)

# Attach the genus to every row of the density table. Taxa without a genus
# name (empty string) are assigned to the phylum "Unidentified".
# Join keys are stated explicitly: ID_NR and Phylum are the shared columns.
gen_comp1 <- VictPhytoTaxonomy %>%
  select(ID_NR, Genus, Phylum) %>%
  right_join(PhytoComp1, by = c("ID_NR", "Phylum")) %>%
  select(-ID_NR) %>%
  mutate(Phylum = ifelse(Genus != "", Phylum, "Unidentified"))

cumperc_cutoff1 <- 95 # Cut-off values for cumulative percentages

# Total density per genus over all samples, ranked from most to least
# abundant, with share of the grand total (perc_tot) and running share
# (cumperc). Genera are kept up to a cumulative share of cut-off + 1.
# NOTE(review): the reason for the extra "+ 1" is not documented - confirm.
gen_comp2 <- gen_comp1 %>%
  pivot_longer(Dry_S1:Rainy_S3, names_to = "Sample", values_to = "Density") %>%
  group_by(Genus) %>%
  summarise(Density_t = sum(Density)) %>%
  arrange(desc(Density_t)) %>%
  mutate(
    cumperc = 100 * cumsum(Density_t) / sum(Density_t),
    perc_tot = 100 * Density_t / sum(Density_t)
  ) %>%
  filter(cumperc <= cumperc_cutoff1 + 1) %>%
  # add the phylum of each genus; distinct() collapses the one-row-per-taxon
  # duplicates created by the join
  inner_join(select(gen_comp1, Genus, Phylum), by = "Genus") %>%
  distinct()

# (Cumulative) percentages per season & station (including unidentified species!)
# After summarise() the data are still grouped by Sample, so `perc` is each
# genus' share of the total density within its own sample.
gen_comp3 <- gen_comp1 %>%
  pivot_longer(Dry_S1:Rainy_S3, names_to = "Sample", values_to = "Density") %>%
  group_by(Sample, Genus) %>%
  summarise(Density_t = sum(Density)) %>%
  arrange(Sample, desc(Density_t)) %>%
  mutate(perc = 100 * Density_t / sum(Density_t)) %>%
  select(-Density_t) %>%
  pivot_wider(names_from = Sample, values_from = perc, values_fill = 0) %>%
  # restrict to the genera selected for the overall table and add their
  # phylum and overall percentage
  inner_join(select(gen_comp2, Genus, Phylum, perc_tot), by = "Genus") %>%
  arrange(desc(perc_tot)) %>%
  relocate(Phylum, .after = Genus) %>%
  # move the "Unidentified" rows to the bottom of the table
  arrange(Phylum == "Unidentified")
# write.csv(gen_comp3, file="Table2.csv")

# Table 3: absolute densities of the most abundant species

# Attach species, genus and phylum to every row of the density table. Taxa
# without a genus name (empty string) are labelled "Unidentified" for both
# species and phylum.
# Join keys are stated explicitly: ID_NR, Species and Phylum are the shared
# columns.
spec_comp1 <- VictPhytoTaxonomy %>%
  select(ID_NR, Species, Genus, Phylum) %>%
  right_join(PhytoComp1, by = c("ID_NR", "Species", "Phylum")) %>%
  select(-ID_NR) %>%
  mutate(
    Species = ifelse(Genus != "", Species, "Unidentified"),
    Phylum = ifelse(Genus != "", Phylum, "Unidentified")
  )

# Mean density per species over all samples. The percentages are computed
# while the unidentified taxa are still included; afterwards only identified
# species with a mean density of at least 1000 are kept and their per-sample
# densities are joined back in.
spec_comp2 <- spec_comp1 %>%
  pivot_longer(Dry_S1:Rainy_S3, names_to = "Sample", values_to = "Density") %>%
  group_by(Species) %>%
  summarise(Density_avg = mean(Density)) %>%
  arrange(desc(Density_avg)) %>%
  mutate(
    cumperc = 100 * cumsum(Density_avg) / sum(Density_avg),
    perc_tot = 100 * Density_avg / sum(Density_avg)
  ) %>%
  filter(Density_avg >= 1000, Species != "Unidentified") %>%
  left_join(spec_comp1, by = "Species") %>%
  select(-c(cumperc, Genus)) %>%
  # column order: Species, Phylum, sample columns, Density_avg, perc_tot
  relocate(Phylum, .after = Species) %>%
  relocate(Density_avg, .after = Rainy_S3) %>%
  relocate(perc_tot, .after = Density_avg)
# write.csv(spec_comp2, file="Table3.csv")

# Table 4: Densities of Cyanobacteria with heterocytes
# Keep the per-sample densities of the four listed genera only
heterocyte1 <- spec_comp1 %>%
  filter(
    Genus %in% c(
      "Anabaenopsis", "Aphanizomenon", "Cylindrospermopsis", "Dolichospermum"
    )
  ) %>%
  select(-Genus, -Phylum)
# write.csv(heterocyte1, file="Table4.csv")

#### 3. Producing figures for the manuscript ----

#### Figure 1: Map of Mwanza Gulf ----
# In a separate script: LakeVictoriaMap.R

#### Figure 2: Chlorophyll-concentrations in Mwanza Gulf ----
# Read the chlorophyll measurements and summarise them per season and
# station as mean and standard deviation
VictChlorophyll <- read.csv("VictChlorophyll.csv", header = TRUE)
str(VictChlorophyll)
chloro_victoria3 <- VictChlorophyll %>%
  group_by(Season, Station) %>%
  summarise(
    Chl_mean = mean(Chlorophyll),
    Chl_std = sd(Chlorophyll)
  )
# write.csv(chloro_victoria3, file="Fig2.csv")

# Figure 3: Numbers of individuals per main phytoplankton group
# Total density per sample and phylum (unidentified taxa excluded), with one
# row per sample and one column per phylum
phyto_maingroup1 <- spec_comp1 %>%
  pivot_longer(Dry_S1:Rainy_S3, names_to = "Sample", values_to = "Density") %>%
  select(-Species, -Genus) %>%
  filter(Phylum != "Unidentified") %>%
  group_by(Sample, Phylum) %>%
  summarise(Density_t = sum(Density)) %>%
  pivot_wider(names_from = Phylum, values_from = Density_t, values_fill = 0)
# write.csv(phyto_maingroup1, file="Fig3.csv")

# Figure 4: Mean numbers of individuals per season and morphological/functional group
# The taxonomy columns ColoniesNoMucus:SingleCells hold a value per form
# (presumably form indicators or weights - confirm). Each taxon's density is
# divided over the forms in proportion to Form_val / tot_form, summed per
# season, station, phylum and form, and then averaged over stations.
phyto_MFG1 <- VictPhytoDens1 %>%
  full_join(VictPhytoTaxonomy, by = "ID_NR") %>%
  select(ID_NR, Season:Density, Phylum, ColoniesNoMucus:SingleCells) %>%
  # row-wise total of the form columns, used as the denominator below
  rowwise() %>%
  mutate(tot_form = sum(c_across(ColoniesNoMucus:SingleCells))) %>%
  ungroup() %>%
  select(-ID_NR) %>%
  pivot_longer(
    ColoniesNoMucus:SingleCells,
    names_to = "Form",
    values_to = "Form_val"
  ) %>%
  mutate(Density_c = Density * Form_val / tot_form) %>%
  select(Season, Station, Phylum, Form, Density_c) %>%
  group_by(Season, Station, Phylum, Form) %>%
  summarise(Density_t = sum(Density_c)) %>%
  group_by(Season, Phylum, Form) %>%
  summarise(Density_m = mean(Density_t)) %>%
  # only the two phyla shown in the figure
  filter(Phylum %in% c("Chlorophyta", "Cyanobacteria")) %>%
  pivot_wider(names_from = Phylum, values_from = Density_m, values_fill = 0)
# write.csv(phyto_MFG1, file="Fig4.csv")

# Alternatively: directly read from external dataset (not based on taxonomy)
# VictPhytoForm <- read.csv(file="VictPhytoForm.csv", header=T)
# str(VictPhytoForm)
# phyto_MFG1 <- pivot_longer(VictPhytoForm,Colonies_no_mucus:Cells_single, names_to = "Form", values_to = "Density") %>%
#              group_by(Season, Phylum, Form) %>% summarise(Density_t=mean(Density)) %>% 
#              filter(Phylum == "Chlorophyta" | Phylum=="Cyanobacteria") %>%
#              pivot_wider(names_from=Phylum, values_from = Density_t, values_fill = 0)
# write.csv(phyto_MFG1, file="phyto_MFG1.csv")

# Figure 5: Size structure of individuals per season and station
# Note that 1 mmu = 2.93 µm! (micrometres; the unit prefix was missing from
# the original note - TODO confirm)
# SizeClass is the rounded log2 of the converted length. drop_na() without
# arguments removes every raw record that has a missing value in any column.
# The result holds, per Season_Station sample (columns), the percentage of the
# total density that falls in each size class (row names).
phyto_size1 <- VictPhytoRaw %>%
  mutate(SizeClass = round(log2(Length_mmu * 2.93))) %>%
  drop_na() %>%
  count(SizeClass, Season, Station, Subsamp) %>%
  full_join(PhytoVictoriaFactors, by = c("Station", "Season", "Subsamp")) %>%
  mutate(nCor = n / Surface_mm2) %>%
  group_by(SizeClass, Season, Station) %>%
  summarise(Density = round(sum(nCor) * samplesurf / dropvol / concentr)) %>%
  arrange(Season, Station, SizeClass) %>%
  # percentages are taken within each season/station sample
  group_by(Season, Station) %>%
  mutate(perc_dens = 100 * Density / sum(Density)) %>%
  select(-Density) %>%
  pivot_wider(
    names_from = c(Season, Station),
    values_from = perc_dens,
    values_fill = 0
  ) %>%
  arrange(SizeClass) %>%
  column_to_rownames(var = "SizeClass")
# write.csv(phyto_size1, file="Fig5.csv")

#### 4. Statistics ----

# First simple ANOVA of chlorophyll data
# https://www.scribbr.com/statistics/anova-in-r/
# Two-way model: Station, Season and their interaction
anova_chloro1 <- aov(Chlorophyll ~ Station * Season, data = VictChlorophyll)
summary(anova_chloro1)

# Make Season-Station combinations (e.g. "<Season>_<Station>") so that every
# sample can be treated as one level of a single factor
chloro_victoria2 <- VictChlorophyll
chloro_victoria2$Season_Station <- paste(
  chloro_victoria2$Season, chloro_victoria2$Station,
  sep = "_"
)

# Simple ANOVA (one-way, on the combined factor)
anova_chloro2 <- aov(Chlorophyll ~ Season_Station, data = chloro_victoria2)
summary(anova_chloro2)
# Diagnostic plots in a 2 x 2 layout, then back to a single panel
par(mfrow = c(2, 2))
plot(anova_chloro2, ask = FALSE)
par(mfrow = c(1, 1))

# Pairwise comparisons
# Wider left margin so the horizontal comparison labels fit; the previous
# margins are restored afterwards so later plots are not affected
old_mar <- par(mar = c(5, 10, 5, 5))
plot(TukeyHSD(anova_chloro2), las = 1)
par(old_mar)
# dev.off()

# Chi-square analysis of size distributions
# Make contingency table
# Note that 1 mmu = 2.93 µm! (micrometres; the unit prefix was missing from
# the original note - TODO confirm)
# Same size-class pipeline as for Figure 5, but the cells hold the rounded sum
# of the corrected counts instead of percentages. Size classes 8, 9 and 10 are
# pooled into a single ">=8" row; the table is transposed so the pooling can
# be done on columns and then transposed back (rows = size classes,
# columns = Season_Station samples).
# NOTE(review): the pooling refers to classes 8, 9 and 10 by name, so all
# three must be present in the data.
phyto_size2 <- VictPhytoRaw %>%
  mutate(SizeClass = round(log2(Length_mmu * 2.93))) %>%
  drop_na() %>%
  count(SizeClass, Season, Station, Subsamp) %>%
  full_join(PhytoVictoriaFactors, by = c("Station", "Season", "Subsamp")) %>%
  mutate(nCor = n / Surface_mm2) %>%
  group_by(SizeClass, Season, Station) %>%
  summarise(Density = round(sum(nCor))) %>%
  arrange(Season, Station, SizeClass) %>%
  group_by(Season, Station) %>%
  pivot_wider(
    names_from = c(Season, Station),
    values_from = Density,
    values_fill = 0
  ) %>%
  arrange(SizeClass) %>%
  column_to_rownames(var = "SizeClass") %>%
  t() %>%
  as.data.frame() %>%
  mutate(`>=8` = `8` + `9` + `10`) %>%
  select(-c(`8`, `9`, `10`)) %>%
  t()

# Overall chi-square test (p-value obtained by simulation)
test <- chisq.test(phyto_size2, simulate.p.value = TRUE)
test

# Pairwise Chi-square tests
# Only the Fisher test is requested; columns (samples) are compared pairwise
# and the p-values are adjusted with the Holm method
library(rcompanion)
pairwiseNominalIndependence(
  as.matrix(phyto_size2),
  compare = "column",
  fisher = TRUE,
  gtest = FALSE,
  chisq = FALSE,
  simulate.p.value = TRUE,
  method = "holm"
)
