diff --git a/sequence_tagging_for_ner/data/download.sh b/sequence_tagging_for_ner/data/download.sh index fc9de3d7f2bacb82361bffbf53f109ff6f3c9060..99d81c1e0949e47187cd082947117eb4e6bd888d 100644 --- a/sequence_tagging_for_ner/data/download.sh +++ b/sequence_tagging_for_ner/data/download.sh @@ -1,4 +1,8 @@ -wget http://cs224d.stanford.edu/assignment2/assignment2.zip +if [ -f assignment2.zip ]; then + echo "data exist" +else + wget http://cs224d.stanford.edu/assignment2/assignment2.zip +fi if [ $? -eq 0 ];then unzip assignment2.zip diff --git a/sequence_tagging_for_ner/reader.py b/sequence_tagging_for_ner/reader.py index 2662abe80b5bf18459c7ffb6c48fdbf73a3bb970..5050d0bf499e59db505758b0af9eed71e6af7de7 100644 --- a/sequence_tagging_for_ner/reader.py +++ b/sequence_tagging_for_ner/reader.py @@ -21,7 +21,7 @@ def canonicalize_word(word, wordset=None, digits=True): if (wordset != None) and (word in wordset): return word word = canonicalize_digits(word) # try to canonicalize numbers if (wordset == None) or (word in wordset): return word - else: return "" # unknown token + else: return "UUUNKKK" # unknown token def data_reader(data_file, word_dict, label_dict): @@ -35,7 +35,7 @@ def data_reader(data_file, word_dict, label_dict): """ def reader(): - UNK_IDX = word_dict[""] + UNK_IDX = word_dict["UUUNKKK"] sentence = [] labels = [] diff --git a/sequence_tagging_for_ner/train.py b/sequence_tagging_for_ner/train.py index dd041b6aaaebee0e263abf4bea035d578344409f..5facfeda0d7152bdc065e141d73392eb51e4c2f8 100644 --- a/sequence_tagging_for_ner/train.py +++ b/sequence_tagging_for_ner/train.py @@ -106,4 +106,5 @@ if __name__ == "__main__": test_data_file="data/test", vocab_file="data/vocab.txt", target_file="data/target.txt", - emb_file="data/wordVectors.txt") + emb_file="data/wordVectors.txt", + model_save_dir="model/")