"""
----------------------------------------
Plagiarism Checker
----------------------------------------
With content creation and blogging one of the good businesses in the
market, everyone wants to try their hand at it, but some lack sufficient
funds to give their articles a plagiarism check, as most plagiarism
checkers do not come for free.

This script builds a simple plagiarism checker: it trains an n-gram
language model (Witten-Bell interpolated) on a reference document, then
scores every token of a suspect document against that model.  Tokens the
model assigns a high probability to were likely "seen" in the reference
text; the per-token scores are rendered as a Plotly heatmap so suspect
passages stand out visually.
----------------------------------------
"""
import re

import nltk
import numpy as np
import plotly.graph_objects as go
from nltk.lm import WittenBellInterpolated
from nltk.tokenize import word_tokenize
from nltk.util import everygrams, pad_sequence
from scipy.ndimage import gaussian_filter

# word_tokenize needs the 'punkt' tokenizer models.
nltk.download('punkt')

# Paths to the reference (training) and suspect (testing) documents.
# TODO: fill these in before running -- open() will fail on empty paths.
train_data_file = ""
test_data_file = ""

# n-gram order of the language model.
n = 4

# ---------------------------------------------------------------------------
# Train the language model on the reference document.
# ---------------------------------------------------------------------------
with open(train_data_file, encoding="utf-8") as f:
    train_text = f.read().lower()

# Preprocessing: drop text inside square/curly brackets (editorial notes,
# stage directions, etc.), then strip punctuation.
train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
train_text = re.sub(r"[^\w\s]", "", train_text)

# Tokenize and left-pad so the first real token has a full (n-1) history.
training_data = list(pad_sequence(word_tokenize(train_text), n,
                                  pad_left=True,
                                  left_pad_symbol="<s>"))

# All ngrams of every order up to n -- the interpolated model needs the
# lower orders for backoff.  (Named train_ngrams so it does not shadow
# nltk.util.ngrams.)
train_ngrams = list(everygrams(training_data, max_len=n))
print("Number of ngrams:", len(train_ngrams))

model = WittenBellInterpolated(n)
model.fit([train_ngrams], vocabulary_text=training_data)
print(model.vocab)

# ---------------------------------------------------------------------------
# Score the suspect document token by token.
# ---------------------------------------------------------------------------
with open(test_data_file, encoding="utf-8") as f:
    test_text = f.read().lower()
test_text = re.sub(r"[^\w\s]", "", test_text)

# Same tokenization/padding scheme as the training text.
testing_data = list(pad_sequence(word_tokenize(test_text), n,
                                 pad_left=True,
                                 left_pad_symbol="<s>"))
print("Length of test data:", len(testing_data))

# P(token | previous n-1 tokens) for every token past the padding; a high
# score means the model recognises this phrasing -> possible plagiarism.
scores = []
for i, item in enumerate(testing_data[n - 1:]):
    s = model.score(item, testing_data[i:i + n - 1])
    scores.append(s)

scores_np = np.array(scores)

# ---------------------------------------------------------------------------
# Render the scores as a heatmap, `width` tokens per row.
# ---------------------------------------------------------------------------
width = 8
height = np.ceil(len(testing_data) / width).astype("int32")
print("Width, Height:", width, ",", height)

# Copy the scores into a rectangular zero-padded buffer so it reshapes
# cleanly; `diff` is how many trailing cells are padding.
a = np.zeros(width * height)
a[:len(scores_np)] = scores_np
diff = len(a) - len(scores_np)

# Gaussian smoothing purely for aesthetics.
a = gaussian_filter(a, sigma=1.0)

# Reshape the flat buffer into the heatmap rectangle.
a = a.reshape(-1, width)

# Row labels: the tokens displayed in each heatmap row (offset by n-1 to
# line up with the scored tokens).
labels = [" ".join(testing_data[i:i + width])
          for i in range(n - 1, len(testing_data), width)]
labels_individual = [x.split() for x in labels]
labels_individual[-1] += [""] * diff  # pad last row to full width
labels = [f"{x:60.60}" for x in labels]  # fixed-width y-axis labels

fig = go.Figure(data=go.Heatmap(
    z=a, x0=0, dx=1,
    y=labels, zmin=0, zmax=1,
    customdata=labels_individual,
    hovertemplate='%{customdata}<br>Score:%{z:.3f}',
    colorscale="burg"))
fig.update_layout({"height": height * 28, "width": 1000,
                   "font": {"family": "Courier New"}})
# Plotly draws row 0 at the bottom by default; reverse so the document
# reads top-to-bottom.
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()