##########################################################
# Project: bibliometric analysis of drought indicators
# across the world
#
# Purpose: process .bib files retrieved from Scopus, 
# extract relevant information using the bibliometrix
# package, and save it as data.frames.
#
# Authors: N. Addor and L. Melsen 
##########################################################

# initialization
# NOTE: this removes every object from the global environment, so start the
# script from a session that holds nothing you still need.
rm(list = ls())

# Set encoding as "UTF-8" to fix error in loading data
options(encoding = "UTF-8")

# load libraries
# library() stops right here with a clear message when bibliometrix is not
# installed; require() would merely return FALSE and let the script fail
# later at the first bibliometrix call.
library("bibliometrix")

# set working directory (all relative paths below are resolved against it)
path <- "//WURNET.NL/Homes/drought indicators and impacts/scripts"
setwd(path)

# load the lists of words that are searched for within each article.
# Note: these are not the Scopus search words. For instance, in Scopus you
# search on SPI and store the results; the country list is then used to check
# how often each country is mentioned in all the SPI papers.
source('./dtb/countrydtb.R')               # list of countries and adjectives
source('./dtb/countrysinsdtb.R')           # different synonyms of country names that are used (e.g. US versus USA)
source('./dtb/multiwordlistdtb.R')         # list of countries consisting of multiple words
source('./dtb/continentdtb.R')             # continents
source('./dtb/countrycontinentdtb.R')      # continent in which each country lies


#####################
# DATA-PREPARATION  #
#####################

# list_indicator names the bibliometric databases that were downloaded, one
# per Scopus search term. Each entry is also used as the base name of the
# .bib file that is read in the main loop.
# We had 34 databases, for 34 different drought indicators and impacts.
# Names used in earlier runs (kept for reference):
#   "SMDI", "VegDRI", "ETDI", "CMI", "SWDI", "PZI", "SAVI", "ESI", "VHI",
#   "SMA", "VCI", "EVI", "NDVI_20_21", "NDVI_19", "SWS", "deciles", "PDSI",
#   "KBDI", "RAI", "spi_2019_2021", "SPEI", "spi_2018", "MD", "SDI", "SWI",
#   "SSFI", "SRI", "streamflow_anomalies", "PHDI", "SWSI"
list_indicator <- list("[drought indicator or impact]")

# create data structures
# per-indicator bibliographic data (a list, filled by name in the main loop)
M <- vector("list", 0L)
# per-indicator default bibliometrix analysis (also a list filled by name)
biban <- vector("list", 0L)
# key statistics on each indicator (top countries and authors), one row each
indicators_df <- data.frame(row.names = list_indicator)
# specific data on each paper - this is what you are interested in, in the end
papers_df <- data.frame()
# running row index into papers_df, i.e. total number of papers so far + 1
paper_counter <- 1


# Count in how many (term form, text field) combinations each entry of a
# two-column term table is found.
#   term_table: table with one row per entry; the terms in column 1 and in
#               column 2 are both searched for (presumably name and
#               adjective - verify against the dtb files)
#   texts:      character vector with the text fields of one paper
# Returns one integer per row of term_table. Every form/field combination
# contributes at most 1, so with three fields the value lies between 0 and 6:
# it measures in how many fields a term is present, not how many times it is
# mentioned. Terms are applied as case-insensitive regular expressions.
count_term_hits <- function(term_table, texts) {
  vapply(
    seq_len(nrow(term_table)),
    function(k) {
      form_1 <- grepl(toString(term_table[k, 1]), texts, ignore.case = TRUE)
      form_2 <- grepl(toString(term_table[k, 2]), texts, ignore.case = TRUE)
      sum(form_1) + sum(form_2)
    },
    integer(1)
  )
}

for (ind in list_indicator) { # loop through the drought indicators / impacts

  # To debug a single indicator, set `ind` by hand here, e.g.
  # ind <- list_indicator[[10]]

  #########################
  # LOAD AND PREPARE DATA #
  #########################
  print(ind)

  # Scopus .bib export for this indicator
  bib_file <- paste0(path, '/scopus_dtb/', ind, '.bib')

  # process data
  M[[ind]] <- convert2df(bib_file, dbsource = "scopus", format = "bibtex")
  # extract affiliations and add them to M[[ind]]
  M[[ind]] <- metaTagExtraction(M[[ind]], Field = "AU_UN", sep = ";")
  # extract countries of affiliations and add them to M[[ind]]
  M[[ind]] <- metaTagExtraction(M[[ind]], Field = "AU_CO", sep = ";")

  # perform author/publication analysis (default bibliometrix analysis)
  biban[[ind]] <- biblioAnalysis(M[[ind]], sep = ";")

  # Local handle on the analysis result. Its elements are read explicitly
  # below instead of attach()-ing it, so nothing is left on the search path
  # when an error interrupts the loop.
  ind_analysis <- biban[[ind]]

  #########################################
  # COLLECT KEY INDICATORS FOR EACH MODEL #
  #########################################
  # Meta-information on the papers, based on the default analysis of the
  # bibliometrix package. Might not be used, but it can come in handy: for
  # instance, the year of the first paper shows when an indicator appeared.

  # number of papers in the database
  indicators_df[ind, 'num_papers'] <- length(ind_analysis[["TotalCitation"]])
  # mean number of citations
  indicators_df[ind, 'mean_num_citations'] <- mean(ind_analysis[["TotalCitation"]])
  # year that first paper appeared with this search term
  indicators_df[ind, 'year_first_paper'] <- min(ind_analysis[["Years"]])
  # year that last paper appeared with this search term (disabled)
  # indicators_df[ind, 'year_last_paper'] <- max(ind_analysis[["Years"]])

  # transform tables to data.frames as they're easier to manipulate
  ind_country <- as.data.frame(ind_analysis[["Countries"]])
  ind_author <- as.data.frame(ind_analysis[["Authors"]])

  # retrieve top 3 countries and authors for each indicator, stored as
  # "<name> <count>". Kept as two loops so that the country columns are
  # created before the author columns.
  for (i in 1:3) {
    indicators_df[ind, paste0('top_country_', i)] <-
      paste(as.character(ind_country[i, 1]), as.character(ind_country[i, 2]))
  }
  for (i in 1:3) {
    indicators_df[ind, paste0('top_author_', i)] <-
      paste(as.character(ind_author[i, 1]), as.character(ind_author[i, 2]))
  }

  ###########################################
  # COLLECT INFO FOR EACH PAPER
  # SEE THE END OF THIS SCRIPT FOR A SUMMARY
  ###########################################

  ind_num_papers <- indicators_df[ind, 'num_papers'] # number of papers for this indicator

  # extract terms from abstract (AB), title (TI) and author keywords (DE);
  # multi-word country names are kept together via keep.terms
  wordextract_ab <- termExtraction(M[[ind]], Field = "AB", remove.numbers = TRUE,
                                   remove.terms = NULL, keep.terms = multiwordlist,
                                   verbose = FALSE)
  wordextract_ti <- termExtraction(M[[ind]], Field = "TI", remove.numbers = TRUE,
                                   remove.terms = NULL, keep.terms = multiwordlist,
                                   verbose = FALSE)
  wordextract_ky <- termExtraction(M[[ind]], Field = "DE", remove.numbers = TRUE,
                                   remove.terms = NULL, keep.terms = multiwordlist,
                                   verbose = FALSE)

  # seq_len() makes this loop a no-op for an empty database (1:0 would run
  # twice with invalid indices)
  for (p in seq_len(ind_num_papers)) { # loop through papers for this indicator

    # AU  Authors
    papers_df[paper_counter, 'AU'] <- M[[ind]]$AU[p]

    # YR  Year of publication
    papers_df[paper_counter, 'YR'] <- ind_analysis[["Years"]][p]

    # JN  Journal of publication
    papers_df[paper_counter, 'JN'] <- M[[ind]]$SO[p]

    # TI  Title
    papers_df[paper_counter, 'TI'] <- M[[ind]]$TI[p]

    # ICF  Country institute first author
    papers_df[paper_counter, 'ICF'] <- ind_analysis[["CO"]][p]

    # alternative:
    # dum <- strsplit(M[[ind]]$AU_CO, ";")[p]
    # papers_df[paper_counter, 'ICF'] <- dum[[1]][1]

    # Extracted terms of this paper: abstract, title, keywords
    texts <- c(wordextract_ab$AB_TM[p],
               wordextract_ti$TI_TM[p],
               wordextract_ky$DE_TM[p])

    # replace country synonyms (column 2) by the name in column 1
    for (j in seq_len(nrow(countrysyns))) {
      texts <- gsub(toString(countrysyns[j, 2]), countrysyns[j, 1], texts,
                    ignore.case = TRUE)
    }

    # CT1-3 + COT1  Application country 1-3 and continent of first country.
    # Rank countries by the number of field/form hits and keep the top 3 that
    # were found at all; ties are resolved by sort() (the original authors
    # note that this yields alphabetical order).
    country_hits <- count_term_hits(countrylist, texts)
    ranked <- sort(country_hits, decreasing = TRUE, index.return = TRUE)
    if (ranked$x[1] > 0) {
      papers_df[paper_counter, 'CT1']  <- countrylist[ranked$ix[1], 1]
      papers_df[paper_counter, 'COT1'] <- countrycontinentlist[ranked$ix[1], 2]
    }
    # CT2/CT3 use the same [row, 1] lookup as CT1; the single-index form
    # would select a column instead of a row if countrylist is a data.frame
    if (ranked$x[2] > 0) {
      papers_df[paper_counter, 'CT2'] <- countrylist[ranked$ix[2], 1]
    }
    if (ranked$x[3] > 0) {
      papers_df[paper_counter, 'CT3'] <- countrylist[ranked$ix[3], 1]
    }

    # COT2: continent found most often in abstract, title and keywords
    continent_hits <- count_term_hits(continentlist, texts)
    ranked <- sort(continent_hits, decreasing = TRUE, index.return = TRUE)
    if (ranked$x[1] > 0) { # only if any continent is mentioned
      papers_df[paper_counter, 'COT2'] <- continentlist[ranked$ix[1], 1]
    }

    # increment counter
    paper_counter <- paper_counter + 1

  }

}

# add unique ID - will be used to exclude papers
# (seq_len() also copes with an empty papers_df, where 1:0 would fail)
papers_df <- data.frame(ID = seq_len(nrow(papers_df)), papers_df)

# transform every column (ID and YR included) to a factor
papers_df[] <- lapply(papers_df, factor)

### SHOW RESULTS
# printed explicitly so the output also appears when the script is source()d
print(indicators_df)
print(summary(papers_df))
print(dim(papers_df))

# save summary of results in new folder; the folder is created when missing
# (no effect and no warning if it already exists)
path_save <- paste0(path, '/drought_indicator_or_impact/')
dir.create(path_save, recursive = TRUE, showWarnings = FALSE)
setwd(path_save)

# capture.output(file = ) redirects the printed output to the file and
# restores the console even if printing fails
capture.output(print(summary(papers_df)),
               file = "sumup_drought_indicator_or_impact.txt")

capture.output(print(indicators_df),
               file = "output_drought_indicator_or_impact.txt")

# Continent (COT1) and country (CT1) summaries go into the same file in one
# call; two separate non-appending writes would leave only the last summary.
capture.output(print(summary(papers_df$COT1)),
               print(summary(papers_df$CT1)),
               file = "drought_indicator_or_impact.txt")

### SAVE RESULTS
# stores the whole global environment in the current (results) folder
save.image(file = "drought_indicator_or_impact_processed.Rdata")

### COLUMNS FOR DATA FRAME papers_df
# check colnames(papers_df)
# ID   Unique paper ID
# AU   Authors
# YR   Year of publication
# JN   Journal of publication
# TI   Title
# ICF  Country institute first author
# CT1  Application country 1
# CT2  Application country 2
# CT3  Application country 3
# COT1 Continent of application country 1
# COT2 Continent found most often in abstract, title and keywords



##########################################################
# Project: bibliometric analysis of drought related studies around the world
#
# Purpose: screen papers and rank countries according to number of publications
#
##########################################################

# initialization: start from an empty global environment
rm(list = ls())

# set working directory
# NOTE(review): this folder differs from the one in which the first part of
# this file saves the .Rdata - confirm both placeholders point to the same
# location before running.
setwd("//WURNET.NL/Homes/drought indicator or impact/")

# load the workspace saved by the processing step (provides papers_df)
load(file = "drought_indicator_or_impact_processed.Rdata")

# load list of countries and indicators
source("dtb/countrydtb.R")
source("dtb/indicatorslist.R")


### which countries were the papers about?
# number of papers per first application country (CT1) ...
country_counts <- as.data.frame(table(papers_df$CT1))
# ... ranked from most to least studied ...
freq_rank <- order(country_counts$Freq, decreasing = TRUE)
# ... with an 'Inc' column preset to 'NO' for every country
table_countries <- data.frame(country_counts[freq_rank, ], Inc = "NO")
write.csv(
  table_countries,
  "drought indicator or impact/drought indicator or impact_all.csv"
)
