
Implementing a skip-gram language model with negative sampling in R

Basic idea:

Given a word w, collect its context words u1, u2, … from the text, and draw negative samples u3, u4, … from the negative-sample pool.

Take the dot product of w's word vector with each of its sample vectors and push it through the sigmoid function to get a probability estimate. The residual against the label target drives gradient descent, which optimizes the input word vectors, the weight vectors, and the bias vector.
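In symbols (my notation, not from the original post): x_w is the input vector of w, w_u and b_u are the weight vector and bias of a sample u, and t is the 0/1 label. Then

p = sigmoid(x_w . w_u + b_u),        sigmoid(z) = 1 / (1 + exp(-z))
loss = -( t*log(p) + (1-t)*log(1-p) )
     = logits - logits*t + log(1 + exp(-logits)),   logits = x_w . w_u + b_u
d loss / d x_w = (p - t) * w_u
d loss / d w_u = (p - t) * x_w
d loss / d b_u =  p - t

The second form of the loss is exactly the cross-entropy formula that appears, commented out, further down in the code.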

Problem:

If the input word vectors, weight vectors, and bias vector are initialized with values that are too large, then wx+b becomes huge, the sigmoid saturates as its argument runs off towards positive or negative infinity, and Inf values show up in the residuals.
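A two-line R illustration of the failure mode (the numbers are picked only to show the effect):

z <- 40                # a logit this large is easy to hit with big initial values
1 / (1 + exp(-z))      # == 1 exactly in floating point: the sigmoid has saturated
log(1 + exp(800))      # Inf: the cross-entropy term overflows
# hence the initialization below draws from runif(n, -0.1, 0.1)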

Results:

1. The model was trained on just 34 sentences from a topic about job hunting; its final output:

string1 <- "找好工作,主要是有經驗"
string2 <- "工作找經驗"


pro1 <- getpro(string1) #23.7
pro2 <- getpro(string2) #4.91

2. The trained word vectors, projected down into two dimensions:

The three words “理解” (understand), “溝通” (communicate), and “長得帥” (good-looking) sit very close together…

So being good-looking turns out to be a sufficient condition for finding a job, too…

# Set the working directory

setwd("../buaa/sahomework")
# Read the data
answer <- read.csv("./data/answer3new",
                   encoding = "UTF-8",
                   sep = ",",
                   header = FALSE,
                   stringsAsFactors = FALSE)


# Tidy up the data frame
names(answer) <- c("questionid","id","username","zan","answer")
answer$questionid <- as.character(answer$questionid)
answer$id <- as.character(answer$id)


# Start with a small sample: all answers to one question, id "655467276313526272"
answers <- answer[which(answer$questionid == "655467276313526272"),]


# Segment the answers into words with jiebaR
library(jiebaR)
wk <- worker()
anscorpus <- data.frame("anscorpus" = wk[answers$answer])


# Leave stop words alone for now
# (open question: should stop words be removed?)
# removeStopWords = function(x,words) {
#   ret = character(0)
#   index <- 1
#   it_max <- length(x)
#   while (index <= it_max) {
#     if (length(words[words==x[index]]) <1) 
#       ret <- c(ret,x[index])
#     index <- index +1
#   }
#   ret
# }
# stopwords <- data.frame("stopwords" = "的")




# corpus <- lapply(as.character(anscorpus), removeStopWords,stopwords)
# corpus <- data.frame("corpus" = unlist(strsplit(as.character(anscorpus),split=",")))
corpus <- anscorpus
# Persist the corpus to disk
write.table(corpus,file = "data/words.csv",sep = ",",col.names = FALSE,row.names = FALSE) # write.csv ignores col.names


# Build the vocabulary from corpus: sort by frequency so a word's rank becomes its index, which makes negative sampling easier (190 distinct words)
corpusFreq <- data.frame(table(corpus))
names(corpusFreq)[1] <- "corpus" # later code expects the word column to be called "corpus"
corpusFreq <- corpusFreq[order(corpusFreq$Freq,decreasing = T),]
corpusFreq$id <- c(1:nrow(corpusFreq)) # 190 words
summary(corpusFreq)


# Show the word frequencies as a word cloud
# install.packages("wordcloud") # run once
library(RColorBrewer)
library(wordcloud)
par(family='STXihei') # has no effect here; the font has to be set inside wordcloud()??
png(file = "wordcloud.png",bg = "white",width = 480,height = 480)
colors = brewer.pal(8,"Dark2")
wordcloud(corpusFreq$corpus,corpusFreq$Freq,
          scale=c(3,0.5),
          min.freq=-Inf,
          max.words=190,
          colors=colors,
          random.order=F,
          random.color=F,
          ordered.colors=F,
          family='STXihei')
dev.off()


# Translate the answers into a string of word ids: 34 sentences, 302 tokens in all,
# so contexts can be extracted; a "." could be appended at sentence ends -- skipped for now
charaIndex <- ""
unuseChara <- 0
for(i in c(1:dim(corpus)[1])){
  if(corpus[i,1] %in% corpusFreq$corpus){
    # print(i)
    chara <- corpusFreq[corpusFreq$corpus == corpus[i,1], 3]
    charaIndex <- paste(charaIndex, chara,sep = ",")
  }else{
      unuseChara <- unuseChara + 1
    }
}
# for(j in c(1:dim(answers)[1])){
#   charactors <- unlist(strsplit(answers[j,5],split = ""))# can't select the column by name??
#   len <- length(charactors)
#   
#   for (i in c(1:len)) {
#     if(charactors[i] %in% corpusFreq$corpus){
#       chara <- corpusFreq[corpusFreq$corpus == charactors[i], 3]
#       charaIndex <- paste(charaIndex, chara,sep = ",")
#     }else{
#       unuseChara <- unuseChara + 1
#     }
#   }
#   # charaIndex <- paste(charaIndex,".",sep = ",")
# }
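As an aside, the token-by-token lookup above can be vectorized with match(); a sketch on the same data (my variant, not the original code):

# match() maps every token to its row in corpusFreq (NA for unknown tokens)
idx <- match(corpus[,1], corpusFreq$corpus)
charaIndex2 <- paste(corpusFreq$id[idx[!is.na(idx)]], collapse = ",")
unuseChara2 <- sum(is.na(idx)) # number of tokens dropped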


# Generate contexts: corpusFreq$context collects, for every word, all of its
# num_skip = 2 contexts (the immediate left and right neighbours)
corpusFreq$context <- NA # initialize the column (assigning NULL would drop it)
# num_skip <- 2
# batch_size <- 190


chara <- unlist(strsplit(charaIndex,split = ","))
chara <- chara[-1] # chara[1] is the empty leading element; 218 tokens remain
for (i in c(1:length(chara))) {
  if(i > 1){
    oldContext <- corpusFreq[which(corpusFreq$id == chara[i]),4]
    corpusFreq[which(corpusFreq$id == chara[i]),4] <- paste(oldContext,chara[i-1],sep = ",")
  }
  if(i < length(chara)){
    oldContext <- corpusFreq[which(corpusFreq$id == chara[i]),4]
    corpusFreq[which(corpusFreq$id == chara[i]),4] <- paste(oldContext,chara[i+1],sep = ",")
  }
}
names(corpusFreq)[4] <- "context"
# TODO: clean the contexts up; fill in the missing ones
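The same window can also be built by shifting the id sequence once left and once right, which avoids growing strings inside a loop (a sketch, under the same num_skip = 2 assumption):

# Left/right neighbour of every position in the id sequence
prev <- c(NA, chara[-length(chara)])
nxt  <- c(chara[-1], NA)
# group the neighbours per word id, e.g. all left contexts:
leftctx <- aggregate(prev, by = list(id = chara), FUN = paste, collapse = ",")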
  
  
# Build the negative-sampling matrix, 190 x 5
valid_sample <- matrix(0,nrow = 190,ncol = 5)


for(i in c(1:dim(corpusFreq)[1])){
  quene <- c(1:dim(corpusFreq)[1])
  quene <- quene[-i] # exclude the word itself from its own negative samples
  valid_sample[i,] <- sample(quene,5,replace = F)
}
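For reference, word2vec does not sample negatives uniformly but from the unigram distribution raised to the 3/4 power; sample()'s prob argument supports this directly (a sketch, not part of the original pipeline):

# Frequency-weighted negative sampling, word2vec style
weights <- corpusFreq$Freq ^ 0.75
for(i in c(1:dim(corpusFreq)[1])){
  quene <- c(1:dim(corpusFreq)[1])[-i] # again excluding the word itself
  valid_sample[i,] <- sample(quene, 5, replace = F, prob = weights[-i])
}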




# Build the sample matrix: each row holds a word's 2 positive samples + 5 negative samples, 190 x 7 in total
contextmatrix <- matrix(0,nrow = 190,ncol = 2)
for(i in c(1:dim(corpusFreq)[1])){
  contextlist <- unlist(strsplit(corpusFreq[i,4],split = ","))
  if(contextlist[1] == "NA"){ # paste() turned the initial NA into the literal string "NA"
    context <- contextlist[c(2:3)]
  }else{
    context <- contextlist[c(1:2)]
  }
  contextmatrix[i,] <- context
}
contextM <- data.frame(cbind(contextmatrix,valid_sample))
# contextM <- lapply(contextM[,],as.numeric)
# contextM <- data.frame(contextM)
# contextM[is.na(contextM)] <- 0
names(contextM) <- c("prefix","suffix","valid1","valid2","valid3","valid4","valid5")


# Label matrix: 1 for the 2 positives, 0 for the 5 negatives
target1 <- matrix(1,nrow = 190,ncol = 2)
target2 <- matrix(0,nrow = 190,ncol = 5)
target <- cbind(target1,target2)


# # Cross-entropy gradient descent to learn train_input
# # cross entropy: logits - logits * target + ln(1 + exp(-logits))
# cross_entropy <- logits - logits * target + log(1 + exp(-logits))
# sum_row <- data.frame(rowSums(cross_entropy))


# Round-robin stochastic gradient descent, one sample at a time
sigmod = function(x){
  return(1 / (1 + exp(-x))) # note the parentheses: 1/1 + exp(-x) is a different function
}
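If overflow is a worry, base R already ships a numerically careful logistic as stats::plogis(); it computes the same curve and could serve as a drop-in replacement (an aside, not the original code):

# plogis(x) == 1 / (1 + exp(-x)), evaluated in a numerically stable way
sigmod2 = function(x){
  return(plogis(x))
}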


logits <- matrix(0,nrow = 190,ncol = 7)


# x: 190 x 128, labels: 190 x 1, W: 190 x 128, B: 190 x 1
nce_weight <- matrix(runif(24320,-0.1,0.1),nrow = 190,ncol = 128)
nce_biases <- matrix(runif(190,-0.1,0.1),nrow = 190)


train_inputs <- matrix(runif(24320,-0.1,0.1),nrow = 190,ncol = 128)
# nce_weight <- nce_weight2
# nce_biases <- nce_biases2
# train_inputs <- train_inputs2
# train_labels <- matrix(runif(190,-1,1),nrow = 190)


# Compute the full logit matrix; handy for debugging the sigmoid and making
# sure no logit runs off towards +/-Inf
genrate_logits = function(){
  logits <- matrix(0,nrow = 190,ncol = 7)
  for(i in c(1:dim(train_inputs)[1])){
    x <- t(data.frame(train_inputs[i,]))
    w <- t(data.frame(nce_weight[as.integer(contextM[i,]),]))
    logits[i,] <- x %*% w + nce_biases[as.integer(contextM[i,]),] # per-context biases, matching the training loop
  }
  return(logits)
}
logits2 <- genrate_logits()
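A quick sanity check on the initialization (my addition): with inputs drawn from (-0.1, 0.1) the logits should stay small.

range(logits2) # should stay modest; large magnitudes mean log(1 + exp(-logits)) can hit Inf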


# Gradient descent
maxiter <- 190
# minerror <- 0.01
step <- 0.01
# newerror <- 1
iter <- 0 # iteration counter
len <- dim(train_inputs)[1]
i <- 1 # index of the current sample in train_inputs
while(iter <= maxiter){
  # print("=========")
  iter <- iter + 1
  if(i > len){i <- i %% len}
  print(i)
  x <- t(data.frame(train_inputs[i,]))
  w <- t(data.frame(nce_weight[as.numeric(contextM[i,]),]))
  # logits wx + b, a 1 x 7 matrix; each sample's residual drives the updates
  logits[i,] <- x %*% w + matrix(nce_biases[as.numeric(contextM[i,]),],nrow = 1,ncol = 7)


  # Update the input vector, weights and biases in turn
  for(j in c(1:length(logits[1,]))){
    # -Inf/Inf and then NaN showed up here: the sigmoid saturates once |wx+b|
    # gets large, so the range of the random initial values was shrunk
    ctx <- as.integer(contextM[i,j])
    err <- sigmod(logits[i,j]) - target[i,j] # scalar residual
    des <- err * nce_weight[ctx,]            # gradient w.r.t. the input vector, 128*1
    desw <- err * train_inputs[i,]           # gradient w.r.t. the context weight vector
    # update x
    train_inputs[i,] <- train_inputs[i,] - step * des
    # print("===== updating train =====")
    print(des[1])
    # update w and b (the bias gradient is just the residual)
    nce_weight[ctx,] <- nce_weight[ctx,] - step * desw
    nce_biases[ctx,] <- nce_biases[ctx,] - step * err
  }
  i <- i + 1
}
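The loop above never measures progress; a cheap convergence check is to re-evaluate the cross-entropy formula commented out earlier once training ends (a sketch):

# Summed cross-entropy over all samples -- should drop as training proceeds
logits_end <- genrate_logits()
cross_entropy <- logits_end - logits_end * target + log(1 + exp(-logits_end))
mean(rowSums(cross_entropy))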
  
# Visualize the word vectors
# PCA
pca <- princomp(train_inputs[,],cor = TRUE,scores = TRUE)
plot(pca, type="lines")
biplot(pca)


# MDS (classical multidimensional scaling)
dis <- dist(train_inputs,diag = TRUE,upper = TRUE )
# fit <- hclust(dis,method = "ward.D")
# plot(fit)
dismatrix <- as.matrix(dis)
mds <- cmdscale(dismatrix,k = 2)
par(family = "STXihei")
plot(mds[,1],mds[,2],type = "n",col = "red")
text(mds[,1],mds[,2],labels = corpusFreq$corpus,cex = 0.5,col = "black")
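The MDS picture can be cross-checked on the raw 128-dimensional vectors with cosine similarity; a small helper (the names are mine):

# Nearest neighbours of word id i by cosine similarity
nearest = function(i, n = 6){
  norms <- sqrt(rowSums(train_inputs^2))
  sims <- (train_inputs %*% train_inputs[i,]) / (norms * norms[i])
  head(corpusFreq$corpus[order(sims, decreasing = TRUE)], n)
}
# nearest(corpusFreq[corpusFreq$corpus == "溝通", 3])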


# Rough probability score of a sentence under the model
getpro = function(s){
  testcorpus <- data.frame("corpus" = wk[s])
  for (i in c(1:dim(testcorpus)[1])) {
    testcorpus$id[i] <- corpusFreq[as.character(corpusFreq$corpus) == testcorpus[i,1],3]
  }
  pro <- 0
  len <- dim(testcorpus)[1] - 1
  for (i in c(2:len)){
    prepro <- sigmod(matrix(train_inputs[testcorpus[i,2],],nrow = 1,ncol = 128) %*% nce_weight[testcorpus[i-1,2],] + nce_biases[testcorpus[i-1,2],])
    sufpro <- sigmod(matrix(train_inputs[testcorpus[i,2],],nrow = 1,ncol = 128) %*% nce_weight[testcorpus[i+1,2],] + nce_biases[testcorpus[i+1,2],])
    proi <- prepro * sufpro
    pro <- pro + proi
  }
  return(pro)
}
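One caveat before comparing the two strings below: pro is a sum over word positions, so a longer sentence picks up more mass simply by being longer. Dividing by the number of scored positions gives a fairer comparison (a sketch; getpro_norm is my own name):

# Length-normalized score: average the per-position probabilities
getpro_norm = function(s){
  n <- dim(data.frame("corpus" = wk[s]))[1]
  return(getpro(s) / (n - 2)) # the loop scores positions 2 .. n-1
}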


string1 <- "找好工作,主要是有經驗"
string2 <- "工作找經驗"


pro1 <- getpro(string1) #23.7
pro2 <- getpro(string2) #4.91