PlagiarismChecker
This commit is contained in:
@@ -3,3 +3,4 @@
|
|||||||
| Serial No. | Program Name |
|
| Serial No. | Program Name |
|
||||||
|------------|--------------|
|
|------------|--------------|
|
||||||
|1 | [ContentAggregator.py](https://github.com/psavarmattas/Python-Projects/blob/master/ContentAggregator.py) |
|
|1 | [ContentAggregator.py](https://github.com/psavarmattas/Python-Projects/blob/master/ContentAggregator.py) |
|
||||||
|
|2 | [PlagiarismChecker.py](https://github.com/psavarmattas/Python-Projects/blob/master/PlagiarismChecker.py) |
|
||||||
107
PlagiarismChecker.py
Normal file
107
PlagiarismChecker.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
"""
|
||||||
|
----------------------------------------
|
||||||
|
Plagiarism Checker
|
||||||
|
----------------------------------------
|
||||||
|
Content creation and blogging are among the
|
||||||
|
good businesses in the market, and everyone
|
||||||
|
wants to try their hand at them, but some
|
||||||
|
lack sufficient funds to give their articles
|
||||||
|
a plagiarism check, as most plagiarism
|
||||||
|
checkers do not come for free. A Python
|
||||||
|
plagiarism checker can be built here
|
||||||
|
using a natural language processing library
|
||||||
|
along with a search API to search the first
|
||||||
|
few pages of Google and detect plagiarism, if any.
|
||||||
|
----------------------------------------
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import nltk; nltk.download('punkt')
|
||||||
|
from nltk.util import ngrams, pad_sequence, everygrams
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
from nltk.lm import MLE, WittenBellInterpolated
|
||||||
|
import numpy as np
|
||||||
|
import plotly.graph_objects as go
|
||||||
|
from scipy.ndimage import gaussian_filter
|
||||||
|
|
||||||
|
# Training data file
# Path of the text file read below; it is empty here and has to be set to a
# real file before the script can run.
train_data_file = ""

# read training data
with open(train_data_file) as f:
    train_text = f.read().lower()

# apply preprocessing (remove text inside square and curly brackets and rem punc)
# The bracket patterns are non-greedy so that each [...] / {...} group is
# removed on its own; a greedy ".*" would also swallow the ordinary text lying
# between the first opening and the last closing bracket on a line.
train_text = re.sub(r"\[.*?\]|\{.*?\}", "", train_text)
# drop every character that is neither a word character nor whitespace
train_text = re.sub(r'[^\w\s]', "", train_text)
|
||||||
|
|
||||||
|
# order of the n-gram model; also passed to pad_sequence below
n = 4

# tokenize the cleaned training text, then left-pad the token stream with
# the "<s>" start symbol
train_tokens = word_tokenize(train_text)
training_data = list(
    pad_sequence(
        train_tokens,
        n,
        pad_left=True,
        left_pad_symbol="<s>",
    )
)
|
||||||
|
|
||||||
|
# generate ngrams
# All grams up to length n over the padded training tokens. They are stored
# under their own name so that the `ngrams` function imported from nltk.util
# is not rebound to a list.
train_ngrams = list(everygrams(training_data, max_len=n))
print("Number of ngrams:", len(train_ngrams))

# build ngram language models
# The grams are passed as a single "sentence"; the vocabulary is taken from
# the padded training tokens.
model = WittenBellInterpolated(n)
model.fit([train_ngrams], vocabulary_text=training_data)
print(model.vocab)
|
||||||
|
|
||||||
|
# path of the text to be checked (empty here; must be filled in)
test_data_file = ""

# load the text under test, lower-cased, with everything that is neither a
# word character nor whitespace stripped out
with open(test_data_file) as f:
    raw_test_text = f.read()
test_text = re.sub(r'[^\w\s]', "", raw_test_text.lower())

# split into tokens and left-pad with the "<s>" start symbol, using the same
# n as the training data
test_tokens = word_tokenize(test_text)
testing_data = list(
    pad_sequence(
        test_tokens,
        n,
        pad_left=True,
        left_pad_symbol="<s>",
    )
)
print("Length of test data:", len(testing_data))
|
||||||
|
|
||||||
|
# score every token from index n-1 onward, using the n-1 tokens directly
# before it as the context handed to the model
scores = [
    model.score(testing_data[pos], testing_data[pos - (n - 1):pos])
    for pos in range(n - 1, len(testing_data))
]

# array form of the scores for the numeric processing that follows
scores_np = np.array(scores)
|
||||||
|
|
||||||
|
# set width and height
# The grid holds one cell per score, so the row count is derived from the
# number of scores rather than from the number of padded tokens. With one
# score per token from index n-1 onward, this keeps the number of grid rows
# equal to the number of row labels built below and keeps the trailing
# padding shorter than one row.
width = 8
height = int(np.ceil(len(scores_np) / width))
print("Width, Height:", width, ",", height)

# copy scores to rectangular blank array
a = np.zeros(width * height)
a[:len(scores_np)] = scores_np
# number of unused trailing cells in the last row
diff = len(a) - len(scores_np)

# apply gaussian smoothing for aesthetics
a = gaussian_filter(a, sigma=1.0)

# reshape to fit rectangle
a = a.reshape(-1, width)

# format labels
# one label per row: the `width` tokens starting at index n-1, n-1+width, ...
labels = [" ".join(testing_data[i:i+width]) for i in range(n-1, len(testing_data), width)]
# per-cell words, used as hover data
labels_individual = [x.split() for x in labels]
# pad the last row's hover data to a full row; nothing to pad without rows
if labels_individual:
    labels_individual[-1] += [""] * diff
# pad / truncate every row label to exactly 60 characters
labels = [f"{x:60.60}" for x in labels]
|
||||||
|
|
||||||
|
# build the heatmap trace: the score grid as z, the row labels on the y axis,
# and the per-cell words attached as custom data for the hover text
heatmap_trace = go.Heatmap(
    z=a,
    x0=0,
    dx=1,
    y=labels,
    zmin=0,
    zmax=1,
    customdata=labels_individual,
    hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
    colorscale="burg",
)
fig = go.Figure(data=heatmap_trace)

# figure size (28 units of height per grid row) and font
layout_settings = {"height":height*28, "width":1000, "font":{"family":"Courier New"}}
fig.update_layout(layout_settings)

# reverse the direction of the y axis
fig.layout.yaxis.autorange = "reversed"

fig.show()
|
||||||
Reference in New Issue
Block a user