library(psych)
# NOTE(review): the two princomp() calls that followed referenced an object
# `mydata` that is never created anywhere in this script, and one call was
# an exact duplicate -- this looks like console-history residue and would
# error at runtime, so the calls are disabled rather than deleted.
# fit <- princomp(mydata, cor = TRUE)
# summary(fit) # print variance accounted for
library(sem)
# NOTE(review): hard-coded absolute path -- this only works on the author's
# machine; prefer running the script from the project directory instead.
setwd("~/BTSync/Phd/Studies/ubicomp_keyword_analysis/automated_analysis/public_release")
library(tm)         # text mining: Corpus, TermDocumentMatrix
library(FactoMineR) # correspondence analysis (CA)
library(igraph)     # bipartite projection / co-word graphs
library(ggplot2)    # strategic diagram plot
# ---------------------------------------------------------------------------
# SETTINGS
# ---------------------------------------------------------------------------
# Year range to analyse (inclusive bounds).
# Note: correspondence analysis needs at least 3 years of data.
MIN_YEAR <- 2008
MAX_YEAR <- 2013
# Number of keyword clusters to cut the dendrogram into.
NUM_CLUSTERS <- 13  # alternative previously tried: 12
# Keywords whose co-occurrence frequency falls outside [MIN_FREQ, MAX_FREQ]
# are dropped from the analysis.
MIN_FREQ <- 76      # alternative previously tried: 15
MAX_FREQ <- 10000
# In the keyword graph, only draw edges whose weight exceeds this threshold.
STRONG_EDGE <- 5
# Read the raw tab-separated data (one row per paper).
# NOTE(review): the original assigned this to `t`, shadowing base::t();
# renamed, since t() is called later in the script as the transpose function.
input_table <- read.table("input.txt", sep = "\t", header = TRUE)
# Convert to a character matrix so columns can be subset uniformly by name.
m <- as.matrix(input_table)
# Keep only papers within the desired period (inclusive bounds).
m <- subset(m, m[, "YEAR"] >= MIN_YEAR)
m <- subset(m, m[, "YEAR"] <= MAX_YEAR)
# Get rid of documents with no keywords.
m <- subset(m, m[, "KEYWORDS"] != "")
# Plot the number of papers per year.
pdf("fig_01_histogram.pdf")
plot(factor(m[, "YEAR"]))
dev.off()
######
# Correspondence analysis of keywords vs. years
######
years <- levels(factor(m[, "YEAR"]))
# One "document" per year: all keywords of that year's papers concatenated.
# (Copying `years` preallocates a character vector of the right length.)
allkeywords <- years
for (i in seq_along(years)) {  # seq_along instead of 1:length() -- safe for empty input
  allkeywords[i] <- paste(subset(m, m[, "YEAR"] == years[i])[, "KEYWORDS"],
                          collapse = " ")
}
myCorpus <- Corpus(VectorSource(allkeywords))
# Term-document matrix: keyword frequency per year.
tdm <- as.matrix(TermDocumentMatrix(myCorpus))
colnames(tdm) <- years
pdf("fig_02_correspondance.pdf")
par(cex = 0.5, font = 1)
plot(CA(tdm), autoLab = "yes", col.row = "red", col.col = "blue")
dev.off()
### Save table 1: per-year paper and keyword totals.
tab1 <- cbind(years, as.vector(table(m[, "YEAR"])), colSums(tdm))
colnames(tab1) <- c("Year", "Total_papers", "Total_Keywords")
write.table(file = "tab_01_statistics.txt", tab1, quote = FALSE, sep = "\t")
#####
##Calculate clusters of keywords
#####
#convert to "Corpus" object
#each document (i.e. paper) has an arbitrary number of keywords
myCorpus <- Corpus(VectorSource(m[,"KEYWORDS"]))
#calculate the Term-Document matrix and co-word matrix
#shows how many times a keyword appears in each document
tdm <- as.matrix(TermDocumentMatrix(myCorpus))
#remove one keyword -- decision to hardcode this.
# (presumably excluded because it would dominate every cluster -- TODO confirm)
tdm <- subset(tdm, rownames(tdm) != "ubiquitouscomputing")
### Generate graph of keywords co-occurance: treat the term-document matrix
### as a bipartite keyword/paper graph and project it onto the keyword side.
g<-graph.incidence(tdm)
g<-bipartite.projection(g)
# Weighted adjacency matrix: entry [i, j] = co-occurrence weight of keywords i and j.
cowords <- as.matrix(get.adjacency(g$proj1, attr="weight"))
#get rid of keywords with too low or too high co-occurance frequency
#update the document-term matrix, co-word matrix, and regenerate co-word graph
to_keep <- (rowSums(cowords) >= MIN_FREQ) & (rowSums(cowords) <= MAX_FREQ)
# Filter rows, transpose, filter rows again -- this trims BOTH dimensions of
# the (square) co-word matrix with the same keep-mask.
cowords <- subset(cowords, to_keep)
cowords <- t(cowords)
cowords <- subset(cowords, to_keep)
tdm <- subset(tdm, to_keep)
g<-graph.incidence(tdm)
g<-bipartite.projection(g)
# Hierarchical clustering of keywords: euclidean distance between the
# keyword correlation profiles of the term-document matrix.
# BUG FIX: method was "war" -- modern R's hclust offers "ward.D"/"ward.D2"
# (the historical "ward" spelling is gone), so "war" no longer partial-matches
# a unique method and raises 'invalid clustering method'. "ward.D" is the
# algorithm the original "ward" name referred to.
hc <- hclust(dist(cor(t(tdm)), method = "euclidean"), method = "ward.D")
## alternatively use cor(cowords)
pdf("fig_03_clusters.pdf")
par(cex = 0.7, font = 1)
plot(hc, hang = -1)  # hang = -1: align all leaf labels at the bottom
#plot(hc)
# rect.hclust both draws the cluster boxes and returns the member lists,
# which are reused below to renumber clusters in plotted order.
plotted_clusters <- rect.hclust(hc, k = NUM_CLUSTERS, border = "red")
dev.off()
# Now get the clusters from the dendrogram, and apply them to the
# co-word graph so we can calculate centrality.
clusters <- as.data.frame(cutree(hc, k = NUM_CLUSTERS))
# Re-assign cluster numbers so that the order of clusters as printed on the
# dendrogram plot matches the numbering used in the strategic diagram.
# (Cosmetic only: skipping this loop would not break anything.)
for (i in seq_along(plotted_clusters)) {  # was 1:length(...) -- unsafe when empty
  # NOTE(review): renamed from `match`, which shadowed base::match().
  in_cluster <- rownames(as.data.frame(clusters)) %in%
    rownames(as.data.frame(plotted_clusters[[i]]))
  clusters[, 1][in_cluster] <- i
}
#Assign cluster ID to each node in the co-word graph
V(g$proj1)$cluster <- as.numeric(clusters[,1])
#or use a proper community detection algorithm
#V(g$proj1)$cluster <- spinglass.community(g$proj1)$membership
#NUM_CLUSTERS <- max(V(g$proj1)$cluster)
# Work on a copy so the edge pruning below does not affect the centrality
# calculations that still use g$proj1.
gg <- g$proj1
# Relabel each node "<keyword>_<clusterID>" so clusters are readable in the plot.
V(gg)$name <- paste(V(gg)$name,V(gg)$cluster,sep="_")
# Keep only strong edges (weight strictly greater than STRONG_EDGE).
gg <- delete.edges(gg, E(gg)[E(gg)$weight <= STRONG_EDGE])
pdf("fig_04_keyword_graph.pdf")
par(cex=0.7,font=1)
plot(gg, layout=layout.fruchterman.reingold, vertex.size=5, vertex.color=V(gg)$cluster)
dev.off()
# Build the strategic-diagram table: one row per cluster with its size,
# frequencies, centrality and density; `description` collects the keyword
# list of each cluster.
strategic <- NULL
description <- NULL
for (i in seq_len(NUM_CLUSTERS)) {  # was 1:NUM_CLUSTERS
  # Extract the subgraph induced by cluster i.
  # NOTE(review): renamed from `c`, which shadowed base::c().
  subg <- induced.subgraph(g$proj1, V(g$proj1)$cluster == i)
  keywords <- V(subg)$name
  # Total keyword frequency of the cluster (from the term-document matrix).
  total_freq <- sum(rowSums(tdm)[(rownames(tdm) %in% keywords)])
  # Total co-word frequency of the cluster.
  total_cwfreq <- sum(rowSums(cowords)[(rownames(cowords) %in% keywords)])
  # Density: total internal edge weight over the number of possible edges.
  nodecount <- length(V(subg))
  possible_edges <- (nodecount * (nodecount - 1)) / 2
  edgecount <- sum(E(subg)$weight)
  density <- edgecount / possible_edges  # NaN when the cluster has 1 node
  # Within the broader network, see how many nodes the cluster could connect
  # to, and how many it actually connects to.
  networksize <- length(V(g$proj1))
  potential_ties <- networksize - nodecount
  # Adjacency rows: this cluster's nodes vs. every node outside the cluster.
  # NOTE(review): renamed from `t`, which shadowed base::t().
  outside_ties <- g$proj1[V(g$proj1)$cluster == i, V(g$proj1)$cluster != i]
  # Count the outside nodes this cluster has no tie to at all.
  not_connected <- sum(colSums(outside_ties) == 0)
  centrality <- (potential_ties - not_connected) / potential_ties
  #centrality <- sum(g$proj1[V(g$proj1)$cluster == i, V(g$proj1)$cluster != i])
  #density <- graph.density(induced.subgraph(g$proj1, V(g$proj1)$cluster == i))
  #	if(is.nan(density)){ density <- 1}
  keywords <- paste(keywords, collapse = ", ")
  description <- rbind(description, keywords)
  strategic <- rbind(strategic,
                     c(i, nodecount, total_freq, total_cwfreq, centrality, density))
}
strategic <- data.frame(strategic)
colnames(strategic) <- c("id", "size", "total_frequency",
                         "total_coword_frequency", "centrality", "density")
### Write table 2: cluster statistics plus each cluster's keyword list.
tab2 <- cbind(strategic, as.vector(description))
write.table(file = "tab_02_clusters.txt", tab2, quote = FALSE, sep = "\t",
            row.names = FALSE)
### Plot strategic diagram (base-graphics alternative kept disabled below):
#pdf("fig_05_strategic.pdf")
#plot(strategic$centrality, strategic$density)
#abline(v=mean(strategic$centrality,na.rm=TRUE))
#abline(h=mean(strategic$density,na.rm=TRUE))
#par(cex=2,font=1)
#text(coordinates, labels=strategic[,"id"])
#dev.off()
# Centrality vs. density, point size = total frequency, quadrants split at
# the mean of each axis.
# BUG FIX: the centrality mean now uses na.rm = TRUE like the density mean
# (and like the disabled base-graphics version above) -- otherwise a single
# NA/NaN row would silently drop the vertical quadrant line.
ggplot(strategic, aes(x = centrality, y = density, label = id)) +
  geom_point(aes(size = total_frequency)) +
  scale_size_area() +
  geom_vline(xintercept = mean(strategic$centrality, na.rm = TRUE)) +
  geom_hline(yintercept = mean(strategic$density, na.rm = TRUE)) +
  geom_text(hjust = 2, vjust = -0.1)
ggsave(filename = "fig_05_strategic.pdf")
# NOTE(review): the two source() calls below re-ran "code.R" in this same
# public_release directory -- almost certainly console-history residue (and,
# if this file IS code.R, a recursive self-execution). Disabled.
# source('~/BTSync/Phd/Studies/ubicomp_keyword_analysis/automated_analysis/public_release/code.R')
# source('~/BTSync/Phd/Studies/ubicomp_keyword_analysis/automated_analysis/public_release/code.R')
