## Written by Vassilis Kostakos
## University of Oulu
## 2014
##
## If you use this code please refer to:
##
## Liu, Y., Goncalves, J., Ferreira, D., Xiao, B., Hosio, S., & Kostakos, V. (2014). 
## CHI 1994-2013: Mapping two decades of intellectual progress through co-word analysis. 
## Proc. CHI, Toronto, Canada, 3553-3562
## and 
## Liu, Y., Goncalves, J., Ferreira, D., Hosio, S., & Kostakos, V. (2014). Identity 
## Crisis of Ubicomp? Mapping 15 Years of the Field’s Development and Paradigm Change. 
## Proc. Ubicomp.
##
##
##


library(tm)
library(FactoMineR)
library(igraph)
library(ggplot2)

#############
####SETTINGS
#############
## Publication years to analyse (inclusive at both ends).
## Note: the correspondence analysis needs at least 3 years.
MIN_YEAR <- 2008
MAX_YEAR <- 2013
## Number of clusters the keyword dendrogram is cut into.
NUM_CLUSTERS <- 13
## Keywords whose total co-occurrence count lies outside
## [MIN_FREQ, MAX_FREQ] are ignored.
MIN_FREQ <- 76
MAX_FREQ <- 10000
## In the keyword graph, edges with weight at or below this value are hidden.
STRONG_EDGE <- 25

### For the period 1999-2007 we used the following settings:
#MIN_YEAR = 1999
#MAX_YEAR = 2007
#NUM_CLUSTERS = 12
#MIN_FREQ = 15
#MAX_FREQ = 10000
#STRONG_EDGE = 25

# Read the tab-separated list of papers. Only the double quote is treated as
# a quoting character and '#' is not a comment marker, so apostrophes or hash
# signs inside a field cannot merge or truncate rows.
t <- read.table("input.txt", sep = "\t", header = TRUE,
                quote = "\"", comment.char = "")

# Convert to a matrix; with mixed column types every cell becomes a string.
m <- as.matrix(t)
stopifnot(all(c("YEAR", "KEYWORDS") %in% colnames(m)))

# Focus on the desired period. The year is parsed back into a number so that
# the comparison against MIN_YEAR / MAX_YEAR is numeric, not lexicographic.
paper_year <- as.numeric(m[, "YEAR"])
in_period <- !is.na(paper_year) &
  paper_year >= MIN_YEAR &
  paper_year <= MAX_YEAR

# Get rid of documents with no keywords.
has_keywords <- !is.na(m[, "KEYWORDS"]) & m[, "KEYWORDS"] != ""

# drop = FALSE keeps `m` a matrix even if a single paper survives the filter.
m <- m[in_period & has_keywords, , drop = FALSE]


# Figure 1: bar chart of the number of papers in each year
papers_by_year <- factor(m[, "YEAR"])
pdf("fig_01_histogram.pdf")
plot(papers_by_year)
dev.off()

######
# Plot correspondence analysis of keywords against years
######
years <- levels(factor(m[, "YEAR"]))
# Fail early with a clear message instead of an obscure error further down.
if (length(years) < 3) {
  stop("Correspondence analysis needs at least 3 years of data; found ",
       length(years), call. = FALSE)
}

# Build one pseudo-document per year by concatenating the keyword strings of
# all papers published in that year.
allkeywords <- vapply(
  years,
  function(yr) paste(m[which(m[, "YEAR"] == yr), "KEYWORDS"], collapse = " "),
  character(1),
  USE.NAMES = FALSE
)

# Term-by-year count matrix: one row per term, one column per year.
myCorpus <- Corpus(VectorSource(allkeywords))
tdm <- as.matrix(TermDocumentMatrix(myCorpus))
colnames(tdm) <- years

pdf("fig_02_correspondance.pdf")
par(cex = 0.5, font = 1)
# Terms (rows) are drawn in red, years (columns) in blue.
plot(CA(tdm), autoLab = "yes", col.row = "red", col.col = "blue")
dev.off()


### Save table 1: per-year paper count and total term occurrences
# Total_Keywords is the column sum of the term-by-year matrix `tdm`, i.e. it
# counts only the terms that TermDocumentMatrix() retained.
tab1 <- cbind(years, as.vector(table(m[, "YEAR"])), colSums(tdm))
colnames(tab1) <- c("Year", "Total_papers", "Total_Keywords")
# row.names = FALSE: the year is already the first column, and omitting the
# row names keeps the header aligned with the data columns (as in table 2).
write.table(tab1, file = "tab_01_statistics.txt", quote = FALSE, sep = "\t",
            row.names = FALSE)



#####
## Per-paper term-document matrix used for clustering keywords
#####

# Each paper is one document made of an arbitrary number of keywords.
myCorpus <- Corpus(VectorSource(m[, "KEYWORDS"]))

# Rows are keywords, columns are papers; each cell counts how many times the
# keyword appears in that paper.
tdm <- as.matrix(TermDocumentMatrix(myCorpus))

# One keyword is deliberately excluded (hard-coded decision).
tdm <- tdm[rownames(tdm) != "ubiquitouscomputing", , drop = FALSE]


### Generate graph of keyword co-occurrence
# Treat the term-document matrix as a bipartite incidence matrix and project
# it; the script uses the first projection (g$proj1) as the keyword network,
# whose edge weights form the co-word matrix.
g <- bipartite.projection(graph.incidence(tdm))
cowords <- as.matrix(get.adjacency(g$proj1, attr = "weight"))

# Drop keywords whose total co-occurrence frequency is too low or too high,
# then bring the co-word matrix, the term-document matrix and the co-word
# graph back in sync with the surviving keywords.
coword_totals <- rowSums(cowords)
to_keep <- (coword_totals >= MIN_FREQ) & (coword_totals <= MAX_FREQ)
# The co-word matrix is symmetric, so filtering rows and columns in one step
# yields the same matrix as filtering rows, transposing and filtering again.
cowords <- cowords[to_keep, to_keep, drop = FALSE]
tdm <- tdm[to_keep, , drop = FALSE]
g <- bipartite.projection(graph.incidence(tdm))


# Hierarchical clustering of keywords from the term-document matrix:
#   1. cor(t(tdm))  - correlation between every pair of keywords (rows of
#                     tdm) across papers;
#   2. dist(...)    - Euclidean distance between the rows of that matrix;
#   3. hclust(...)  - agglomerative clustering with the "ward.D2" method.
# NOTE(review): the call is kept as a single nested expression on purpose;
# plot.hclust() appears to build its axis annotation from the expression
# passed to hclust(), so introducing intermediate variables would change the
# labels printed on the figure - confirm before refactoring.
hc <- hclust(dist(cor(t(tdm)),method="euclidean"), method="ward.D2")


# Figure 3: dendrogram with the NUM_CLUSTERS clusters outlined in red.
pdf("fig_03_clusters.pdf")
par(cex=0.7,font=1)
# hang = -1 makes all labels hang down from the baseline.
plot(hc, hang = -1)
# Alternative with the default label placement:
#plot(hc)
# rect.hclust() draws the boxes on the open device; its return value (one
# element per drawn box, holding the keywords in it) is kept so the clusters
# can later be renumbered in the order they appear on the plot.
plotted_clusters <- rect.hclust(hc, k= NUM_CLUSTERS, border="red")
dev.off()

# Cut the dendrogram into NUM_CLUSTERS groups; the result (one row per
# keyword, cluster number in the first column) is later applied to the
# co-word graph so that centrality can be calculated.
clusters <- as.data.frame(cutree(hc, k = NUM_CLUSTERS))

# Renumber the clusters so that the numbering used in the strategic diagram
# follows the order in which the clusters were drawn on the dendrogram plot.
# This step is cosmetic and could be skipped without affecting the analysis.
for (plot_pos in seq_along(plotted_clusters)) {
  members <- names(plotted_clusters[[plot_pos]])
  clusters[rownames(clusters) %in% members, 1] <- plot_pos
}


# Store each keyword's cluster number on its vertex in the co-word graph.
V(g$proj1)$cluster <- as.numeric(clusters[, 1])

# Plot a copy of the graph: vertices are labelled "<keyword>_<cluster>" and
# only edges with weight above STRONG_EDGE are kept.
gg <- g$proj1
V(gg)$name <- paste(V(gg)$name, V(gg)$cluster, sep = "_")
weak_edges <- E(gg)[E(gg)$weight <= STRONG_EDGE]
gg <- delete.edges(gg, weak_edges)

# Figure 4: keyword graph, vertices coloured by cluster number.
pdf("fig_04_keyword_graph.pdf")
par(cex = 0.7, font = 1)
plot(
  gg,
  layout = layout.fruchterman.reingold,
  vertex.size = 5,
  vertex.color = V(gg)$cluster
)
dev.off()


# For every cluster compute the figures reported in table 2 and plotted in
# the strategic diagram:
#   size                   - number of keywords in the cluster
#   total_frequency        - summed occurrences of its keywords (from tdm)
#   total_coword_frequency - summed co-occurrence counts of its keywords
#   centrality             - share of keywords outside the cluster that are
#                            linked to at least one keyword inside it
#   density                - summed internal edge weight divided by the
#                            number of possible keyword pairs in the cluster
# `strategic` ends up as a data frame with one row per cluster and
# `description` holds the comma-separated keyword list of each cluster.
keyword_graph <- g$proj1
cluster_of <- V(keyword_graph)$cluster
networksize <- length(cluster_of)

# Weighted adjacency matrix of the keyword graph. Slicing a plain matrix with
# drop = FALSE keeps two dimensions even when a cluster holds one keyword (or
# only one keyword lies outside it), so colSums() below always works.
adjacency <- as.matrix(get.adjacency(keyword_graph, attr = "weight"))

# Loop-invariant totals, computed once.
keyword_freq <- rowSums(tdm)
coword_freq <- rowSums(cowords)

strategic_rows <- vector("list", NUM_CLUSTERS)
description <- character(NUM_CLUSTERS)
for (i in seq_len(NUM_CLUSTERS)) {
  # extract cluster
  in_cluster <- cluster_of == i
  cluster_graph <- induced.subgraph(keyword_graph, which(in_cluster))
  keywords <- V(cluster_graph)$name

  # total frequency and total co-word frequency of the cluster
  total_freq <- sum(keyword_freq[names(keyword_freq) %in% keywords])
  total_cwfreq <- sum(coword_freq[names(coword_freq) %in% keywords])

  # density; a single-keyword cluster has no possible pairs, so this is
  # 0 / 0 = NaN and the cluster is left out of the strategic diagram
  nodecount <- sum(in_cluster)
  possible_edges <- (nodecount * (nodecount - 1)) / 2
  edgecount <- sum(E(cluster_graph)$weight)
  cluster_density <- edgecount / possible_edges

  # within the broader network, see how many keywords the cluster could
  # connect to and how many it actually connects to
  potential_ties <- networksize - nodecount
  # rows: keywords inside the cluster, columns: keywords outside it
  cross_links <- adjacency[in_cluster, !in_cluster, drop = FALSE]
  # an all-zero column is an outside keyword with no link into the cluster
  not_connected <- sum(colSums(cross_links) == 0)
  centrality <- (potential_ties - not_connected) / potential_ties

  description[i] <- paste(keywords, collapse = ", ")
  strategic_rows[[i]] <- c(i, nodecount, total_freq, total_cwfreq,
                           centrality, cluster_density)
}
strategic <- data.frame(do.call(rbind, strategic_rows))
colnames(strategic) <- c("id", "size", "total_frequency",
                         "total_coword_frequency", "centrality", "density")

### Write table 2: per-cluster statistics plus the keywords of each cluster
# The keyword column is named explicitly; otherwise its header in the output
# file would be the literal R expression used to build it.
tab2 <- cbind(strategic, keywords = as.vector(description))
write.table(tab2, file = "tab_02_clusters.txt", quote = FALSE, sep = "\t",
            row.names = FALSE)


### Plot strategic diagram
# One point per cluster (centrality on x, density on y), sized by the total
# keyword frequency and labelled with the cluster id. The vertical and
# horizontal lines mark the mean centrality and mean density; na.rm = TRUE
# keeps clusters with an undefined value from turning the means into NaN.
strategic_plot <- ggplot(strategic,
                         aes(x = centrality, y = density, label = id)) +
  geom_point(aes(size = total_frequency)) +
  scale_size_area() +
  geom_vline(xintercept = mean(strategic$centrality, na.rm = TRUE)) +
  geom_hline(yintercept = mean(strategic$density, na.rm = TRUE)) +
  geom_text(hjust = 2, vjust = -0.1)
# Pass the plot explicitly instead of relying on the implicit "last plot".
ggsave(filename = "fig_05_strategic.pdf", plot = strategic_plot)




