library(proxy)
library(SnowballC)
library(rplos)
library(jsonlite)

# When TRUE, write diagnostic PDFs (clustering.pdf, mds.pdf) and the
# CSV/stats files; FALSE keeps the script side-effect free except for
# the JSON printed at the end.
debug <- FALSE

# Get data from PLOS API
start.time <- Sys.time()
corpus <- tm_map(corpus, stemDocument)

tdm <- TermDocumentMatrix(corpus)

# Keep only terms whose sparsity is at most 0.3, i.e. terms that occur in
# roughly 70% or more of the documents (see tm::removeSparseTerms).
tdm <- removeSparseTerms(tdm, 0.3)

# Transpose so rows are documents, columns are terms (documents are the
# objects being compared below).
tdm_matrix <- t(as.matrix(tdm))

# Pairwise cosine distance between documents; converted to a "dist"
# object because the clustering functions below expect one.
distance_matrix_2 <- as.matrix(proxy::dist(tdm_matrix, method = "cosine"))
distance_matrix <- as.dist(distance_matrix_2)
# Perform clustering, use elbow to determine a good number of clusters
css_cluster <- css.hclust(distance_matrix,
                          hclust.FUN.MoreArgs = list(method = "ward.D"))
cut_off <- elbow.batch(css_cluster)

num_clusters <- cut_off$k
meta_cluster <- attr(css_cluster, "meta")
cluster <- meta_cluster$hclust.obj
labels <- labels(distance_matrix)
# Cluster membership (integer id per document), reused for plotting and output
groups <- cutree(cluster, k = num_clusters)

if (debug) {
  # Plot result of clustering to PDF file
  # NOTE(review): metadata is created outside this chunk — presumably the
  # PLOS result set with a `title` column; confirm against the loading code.
  pdf("clustering.pdf", width = 19, height = 12)
  plot(cluster, labels = metadata$title, cex = 0.6)
  rect.hclust(cluster, k = num_clusters, border = "red")
  dev.off()
}

num_clusters
# Pick the minimum-stress NMDS configuration; `nm` comes from an earlier
# nmds() call not visible in this chunk — TODO confirm upstream.
nm.nmin <- nmds.min(nm)
x <- nm.nmin$X1
y <- nm.nmin$X2

if (debug) {
  # Plot results from multidimensional scaling, highlight clusters with symbols
  pdf("mds.pdf")
  plot(nm.nmin, pch = groups)
  dev.off()
}
# Prepare the output: attach MDS coordinates and cluster ids to the
# document metadata, keyed by document id.
result <- cbind(x, y, groups, labels)
output <- merge(metadata, result, by.x = "id", by.y = "labels", all = TRUE)
names(output)[names(output) == "groups"] <- "area_uri"
# Human-readable cluster label, e.g. "Cluster 3"
output["area"] <- paste("Cluster", output$area_uri)
# Serialize the annotated metadata and print it — this is the script's
# primary output (consumed by whatever invokes the script).
output_json <- toJSON(output)
print(output_json)
if (debug) {
  # Write output to file
  file_handle <- file("output_file.csv", open = "w")
  write.csv(output, file = file_handle, row.names = FALSE)
  close(file_handle)

  # Write some stats to a file
  file_handle <- file("stats.txt", open = "w")
  writeLines(c(
    paste("Number of Clusters:", num_clusters, sep = " "),
    paste("Description:", attributes(cut_off)$description),
    paste("Stress:", min(nm$stress), sep = " "),
    paste("R2:", max(nm$r2), sep = " ")
  ), file_handle)
  close(file_handle)
}