
In the following code, I convert each word to a one-hot vector and use an RNN in Theano to perform gradient descent for predicting the next word given a sequence of words (basically a language model). I am unable to get the gradient step for the RNN working in Theano.

# coding: utf-8 

# In[68]: 

#Importing stuff 
import theano 
import theano.tensor as T 
import numpy as np 


# In[69]: 

import nltk 
import sys 
import operator 
import csv 
import itertools 
from utils import * 
from datetime import datetime 


# In[70]: 

#Fixing vocabulary size for one hot vectors and some initialization stuff 
v_size = 8000 
unknown_token = "UNKNOWN_TOKEN" 
start_token = "<s>" 
end_token = "</s>" 


# In[71]: 

#Read data and start preprocessing 
with open('reddit-comments-2015-08.csv','rb') as f: 
    reader = csv.reader(f, skipinitialspace=True) 
    reader.next() 
    sentences = list(itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8')) for x in reader])) 
    print len(sentences) 


# In[72]: 

#Tokenize the sentences and add start and end tokens 
tokenized_sentences = [nltk.word_tokenize(s) for s in sentences] 
tokenized_sentences = [[start_token] + s + [end_token] for s in tokenized_sentences] 


# In[73]: 

#Get word frequencies and use only most frequent words in vocabulary 
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences)) 
vocab = word_freq.most_common(v_size-1) 


# In[74]: 

#Do mapping and reverse mapping 
index_to_word = [x[0] for x in vocab] 
index_to_word.append(unknown_token) 
word_to_index = {w:i for i,w in enumerate(index_to_word)} 

#Removing less frequent words 
for i, s in enumerate(tokenized_sentences): 
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in s] 

#Got vectors but they are not one hot 
X_train = np.asarray([[word_to_index[w] for w in s[:-1]] for s in tokenized_sentences]) 
Y_train = np.asarray([[word_to_index[w] for w in s[1:]] for s in tokenized_sentences]) 
#Preprocessing ends here 


# In[75]: 

#Take only one sentence for now 
X_train = X_train[0] 
Y_train = Y_train[0] 


# In[76]: 

#Make input and output as onehot vectors. This can easily be replaced with vectors generated by word2vec. 
X_train_onehot = np.eye(v_size)[X_train] 
X = theano.shared(np.array(X_train_onehot).astype('float32'), name = 'X') 
Y_train_onehot = np.eye(v_size)[Y_train] 
Y = theano.shared(np.array(Y_train_onehot).astype('float32'), name = 'Y') 


# In[77]: 

#Initializing U, V and W 
i_dim = v_size 
h_dim = 100 
o_dim = v_size 

U = theano.shared(np.random.randn(i_dim, h_dim).astype('float32'), name = 'U') 
W = theano.shared(np.random.randn(h_dim, h_dim).astype('float32'), name = 'W') 
V = theano.shared(np.random.randn(h_dim, o_dim).astype('float32'), name = 'V') 


# In[78]: 

#forward propagation 
s = T.vector('s') 

results, updates = theano.scan(lambda x, sm1: T.tanh(T.dot(x, U) + T.dot(sm1, W)), 
           sequences = X_train_onehot, 
           outputs_info = s 
          ) 
y_hat = T.dot(results, V) 

forward_propagation = theano.function(inputs=[s], outputs = y_hat) 


# In[80]: 

#loss 
loss = T.sum(T.nnet.categorical_crossentropy(y_hat, Y)) 


# In[81]: 

#Gradients 
dw = T.grad(loss, W) 
du = T.grad(loss, U) 
dv = T.grad(loss, V) 


# In[82]: 

#BPTT 
learning_rate = T.scalar('learning_rate') 
gradient_step = theano.function(inputs = [s, learning_rate], 
           updates = (
           (U, U - learning_rate * du), 
           (V, V - learning_rate * dv), 
           (W, W - learning_rate * dw) 
           ) 
           ) 



However, the gradient step throws an error. Since I don't know which step is causing the error, I am posting the complete code. Below is a screenshot of the error from the Jupyter notebook.

[Screenshot of the error from the Jupyter notebook]

Answer


I solved it. The problem was a type mismatch: I had to cast du, dv, dw, and learning_rate to float32. By default, they are float64.
