import os
import random

import spacy
from spacy.util import minibatch, compounding
import pandas as pd

TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders life
silently, it's gorgeous.

The movie doesn't seem to decide whether it's slapstick, farce, magical
realism, or drama, but the best of it doesn't matter. (The worst is sort of
tedious - like Office Space with less humor.)
"""

eval_list = []


def train_model(
    training_data: list, test_data: list, iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields an infinite series of batch sizes
        for i in range(iterations):
            print(f"Training iteration {i}")
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data,
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )
                # Collect per-iteration metrics so they can be plotted later
                eval_list.append(evaluation_results)

    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")


def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because it appears in a denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]["cats"]
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels, so the pos label
            # alone carries all the information you need
            if predicted_label == "neg":
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}


def test_model(input_data: str = TEST_REVIEW):
    # Load saved trained model
    loaded_model = spacy.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )


def load_training_data(
    data_directory: str = "aclImdb/train", split: float = 0.8, limit: int = 0
) -> tuple:
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = f"{data_directory}/{label}"
        for review in os.listdir(labeled_directory):
            if review.endswith(".txt"):
                with open(f"{labeled_directory}/{review}") as f:
                    text = f.read()
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        spacy_label = {
                            "cats": {
                                "pos": "pos" == label,
                                "neg": "neg" == label,
                            }
                        }
                        reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:]


if __name__ == "__main__":
    train, test = load_training_data(limit=25)
    print("Training model")
    train_model(train, test)
    df = pd.DataFrame(eval_list)
    df.plot()
    print("Testing model")
    test_model()