PlagiarismChecker

2021-05-13 15:22:22 +05:30
parent a98908f979
commit 19614b9707
2 changed files with 108 additions and 0 deletions
--- a/(AP).md
+++ b/(AP).md
@@ -3,3 +3,4 @@
 | Serial No. | Program Name |
 |------------|--------------|
 |1           | [ContentAggregator.py](https://github.com/psavarmattas/Python-Projects/blob/master/ContentAggregator.py) |
 |2           | [PlagiarismChecker.py](https://github.com/psavarmattas/Python-Projects/blob/master/PlagiarismChecker.py) |
--- a/PlagiarismChecker.py
+++ b/PlagiarismChecker.py
@@ -0,0 +1,107 @@
 """ 
 ----------------------------------------
 Plagiarism Checker 
 ----------------------------------------
 Content creation and blogging are popular
 businesses, and many people want to try
 their hand at them, but some lack the funds
 to run a plagiarism check on their articles,
 since most plagiarism checkers are not free.
 A Python plagiarism checker can be built
 using a natural language processing library
 along with a search API to search the first
 few pages of Google and detect plagiarism,
 if any.
 ----------------------------------------
 """
 import re
 import nltk; nltk.download('punkt')
 from nltk.util import ngrams, pad_sequence, everygrams
 from nltk.tokenize import word_tokenize
 from nltk.lm import MLE, WittenBellInterpolated
 import numpy as np
 import plotly.graph_objects as go
 from scipy.ndimage import gaussian_filter
 # Path of the reference (training) document. Left empty as a placeholder:
 # it has to be filled in before the script is run, otherwise the open()
 # call below fails.
 train_data_file = ""
 # Read the whole training document, lower-cased so that the later
 # comparison with the (also lower-cased) test text ignores case.
 with open(train_data_file) as f:
     train_text = f.read().lower()
 # Preprocessing: remove bracketed spans such as "[1]" or "{note}", then
 # strip every character that is neither a word character nor whitespace.
 # The quantifiers are non-greedy so each bracket pair is removed on its
 # own; a greedy ".*" would also delete all ordinary text lying between the
 # first opening and the last closing bracket on the same line.
 train_text = re.sub(r"\[.*?\]|\{.*?\}", "", train_text)
 train_text = re.sub(r"[^\w\s]", "", train_text)
 # Order of the n-gram model (also read by the scoring code further down).
 n = 4
 # Tokenize the text and left-pad the token sequence with "<s>" symbols.
 training_data = list(pad_sequence(word_tokenize(train_text), n,
                                   pad_left=True,
                                   left_pad_symbol="<s>"))
 # Generate all n-grams up to length n. Kept under its own name so the
 # `ngrams` function imported from nltk.util is not shadowed.
 train_ngrams = list(everygrams(training_data, max_len=n))
 print("Number of ngrams:", len(train_ngrams))
 # Build the n-gram language model (Witten-Bell interpolated) and fit it on
 # the training n-grams, using the padded tokens as vocabulary.
 model = WittenBellInterpolated(n)
 model.fit([train_ngrams], vocabulary_text=training_data)
 print(model.vocab)
 # Path of the document to be checked (placeholder, fill in before running).
 test_data_file = ""
 # Load the document under test in lower case and strip everything that is
 # neither a word character nor whitespace.
 with open(test_data_file) as test_file:
     test_text = test_file.read().lower()
 test_text = re.sub(r'[^\w\s]', "", test_text)
 # Tokenize, then left-pad with "<s>" in the same way as the training data.
 test_tokens = word_tokenize(test_text)
 testing_data = list(
     pad_sequence(test_tokens, n, pad_left=True, left_pad_symbol="<s>")
 )
 print("Length of test data:", len(testing_data))
 # Score every token from index n-1 onwards, using the n-1 tokens that
 # immediately precede it as the context passed to the model.
 scores = [
     model.score(token, testing_data[start:start + n - 1])
     for start, token in enumerate(testing_data[n - 1:])
 ]
 scores_np = np.array(scores)
 # Number of tokens (cells) per heatmap row.
 width = 8
 # Number of rows needed to hold every score. This is derived from the
 # number of scores, not from len(testing_data): the latter also counts the
 # n-1 pad tokens, which have no score and no label, and could therefore
 # yield one row more than there are labels and over-pad the last row.
 height = int(np.ceil(len(scores_np) / width))
 print("Width, Height:", width, ",", height)
 # Copy the scores into a zero-filled flat buffer that fills the grid exactly.
 a = np.zeros(width * height)
 a[:len(scores_np)] = scores_np
 # Count of unused cells at the end of the last row.
 diff = len(a) - len(scores_np)
 # Apply gaussian smoothing for aesthetics; it runs on the flat sequence,
 # i.e. along token order, before the data is folded into rows.
 a = gaussian_filter(a, sigma=1.0)
 # Fold the flat buffer into a (height, width) grid.
 a = a.reshape(height, width)
 # One label per row: the tokens of that row joined by spaces, starting
 # after the n-1 pad tokens.
 labels = [" ".join(testing_data[i:i+width]) for i in range(n-1, len(testing_data), width)]
 # Per-cell hover text; the last row is padded with empty strings so that
 # it has as many entries as the grid row it belongs to.
 labels_individual = [x.split() for x in labels]
 if labels_individual:  # nothing to pad when there are no rows at all
     labels_individual[-1] += [""] * diff
 # Truncate/pad every row label to exactly 60 characters.
 labels = [f"{x:60.60}" for x in labels]
 # Create the heatmap: one cell per token, coloured by its smoothed score.
 fig = go.Figure(data=go.Heatmap(
                 z=a, x0=0, dx=1,
                 y=labels, zmin=0, zmax=1,
                 customdata=labels_individual,
                 hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
                 colorscale="burg"))
 fig.update_layout({"height":height*28, "width":1000, "font":{"family":"Courier New"}})
 # Reverse the y axis so the first row of text appears at the top.
 fig['layout']['yaxis']['autorange'] = "reversed"
 fig.show()