""" ---------------------------------------- Plagiarism Checker ---------------------------------------- With content creation and blogging one of the good businesses in the market everyone wants to try their hands on this but some lack sufficient funds to give their articles a free plagiarism check as mostly plagiarism checkers do not come for free. Building a Python plagiarism checker could be built here using a natural language processing library along with the search API to search the first few pages of Google and detect plagiarism if any. ---------------------------------------- """ import re import nltk; nltk.download('punkt') from nltk.util import ngrams, pad_sequence, everygrams from nltk.tokenize import word_tokenize from nltk.lm import MLE, WittenBellInterpolated import numpy as np import plotly.graph_objects as go from scipy.ndimage import gaussian_filter # Training data file train_data_file = "" # read training data with open(train_data_file) as f: train_text = f.read().lower() # apply preprocessing (remove text inside square and curly brackets and rem punc) train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text) train_text = re.sub(r'[^\w\s]', "", train_text) # set ngram number n = 4 # pad the text and tokenize training_data = list(pad_sequence(word_tokenize(train_text), n, pad_left=True, left_pad_symbol="")) # generate ngrams ngrams = list(everygrams(training_data, max_len=n)) print("Number of ngrams:", len(ngrams)) # build ngram language models model = WittenBellInterpolated(n) model.fit([ngrams], vocabulary_text=training_data) print(model.vocab) # testing data file test_data_file = "" # Read testing data with open(test_data_file) as f: test_text = f.read().lower() test_text = re.sub(r'[^\w\s]', "", test_text) # Tokenize and pad the text testing_data = list(pad_sequence(word_tokenize(test_text), n, pad_left=True, left_pad_symbol="")) print("Length of test data:", len(testing_data)) # assign scores scores = [] for i, item in enumerate(testing_data[n-1:]): s = model.score(item, testing_data[i:i+n-1]) scores.append(s) scores_np = np.array(scores) # set width and height width = 8 height = np.ceil(len(testing_data)/width).astype("int32") print("Width, Height:", width, ",", height) # copy scores to rectangular blank array a = np.zeros(width*height) a[:len(scores_np)] = scores_np diff = len(a) - len(scores_np) # apply gaussian smoothing for aesthetics a = gaussian_filter(a, sigma=1.0) # reshape to fit rectangle a = a.reshape(-1, width) # format labels labels = [" ".join(testing_data[i:i+width]) for i in range(n-1, len(testing_data), width)] labels_individual = [x.split() for x in labels] labels_individual[-1] += [""]*diff labels = [f"{x:60.60}" for x in labels] # create heatmap fig = go.Figure(data=go.Heatmap( z=a, x0=0, dx=1, y=labels, zmin=0, zmax=1, customdata=labels_individual, hovertemplate='%{customdata}
# set width and height of the score grid (width tokens per heatmap row)
width = 8
height = np.ceil(len(testing_data) / width).astype("int32")
print("Width, Height:", width, ",", height)

# copy scores into a rectangular blank array, zero-padding the tail
a = np.zeros(width * height)
a[:len(scores_np)] = scores_np
diff = len(a) - len(scores_np)

# apply gaussian smoothing for aesthetics
a = gaussian_filter(a, sigma=1.0)

# reshape to fit the rectangle
a = a.reshape(-1, width)

# format labels: the run of test tokens corresponding to each heatmap row
labels = [" ".join(testing_data[i:i + width]) for i in range(n - 1, len(testing_data), width)]
labels_individual = [x.split() for x in labels]
labels_individual[-1] += [""] * diff
labels = [f"{x:60.60}" for x in labels]

# create heatmap: bright cells mark passages the model has seen before
fig = go.Figure(data=go.Heatmap(
    z=a, x0=0, dx=1,
    y=labels, zmin=0, zmax=1,
    customdata=labels_individual,
    hovertemplate='%{customdata} <br>Score: %{z:.3f}',
    colorscale="burg"))
fig.update_layout({"height": height * 28, "width": 1000,
                   "font": {"family": "Courier New"}})
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()
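# ----------------------------------------
# Optional: save the heatmap as a standalone HTML report so the result can
# be shared without rerunning the script. write_html() is plotly's standard
# figure-export method; the filename here is an arbitrary choice.
fig.write_html("plagiarism_report.html")
# ----------------------------------------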