Lasse's book recommender
This is the third part of my little project: a rating system that extracts text from images of book pages and predicts how much I will like the book. In this notebook I want to show you how to use ipywidgets to turn a notebook into something we can use like a web application. Furthermore, I will show you how to download the trained model from part 2 from my private Google Drive. So let's get started!
!pip install googledrivedownloader
!pip install transformers
from fastai.vision.all import *
from fastai.vision.widgets import *
from PIL import Image, ImageFilter
import pytesseract
import re
from transformers import BertTokenizer, BertForSequenceClassification
from pathlib import Path
from torch.utils.data import TensorDataset, DataLoader
Models, especially in the deep learning context, can get quite large, and mine was too big to push to git, so I needed a way around that. I uploaded the trained model from part 2 to my Google Drive and use the google_drive_downloader package to pull it into the notebook.
from google_drive_downloader import GoogleDriveDownloader as gdd
gdd.download_file_from_google_drive(file_id='1kk_SvwpwZeuLnZirW5vbrd8FEnm7yJRt',
                                    dest_path='./export.pkl',
                                    unzip=True)
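If the download silently fails (for example because of a wrong file id or a flaky connection), the first error you would otherwise see is a confusing one from torch.load much later. A small, optional check right after the download makes that failure mode obvious:
# Optional: confirm the download actually produced the model file.
model_file = Path('./export.pkl')
assert model_file.exists(), 'Download failed - check the file_id and your connection.'
print(f'Downloaded {model_file.name}: {model_file.stat().st_size / 1e6:.1f} MB')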
import warnings
warnings.filterwarnings("ignore")
Next, we reuse the steps you already know from part 2: rotate and filter the image, extract the text with pytesseract, tokenize the text and put it into a dataloader, and download the pre-trained model from the awesome Hugging Face library.
def proc_img(input_img):
    # Rotate the photo into reading orientation and smooth out noise with a median filter.
    img = input_img.rotate(angle=270, resample=0, expand=10, center=None, translate=None, fillcolor=None)
    img = img.filter(ImageFilter.MedianFilter)
    return img
def get_text(img):
    # Run Tesseract OCR with the German language model.
    return pytesseract.image_to_string(img, lang="deu")
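To see what Tesseract actually produces, you can peek at the raw output before any cleaning. The file name 'sample_page.jpg' is just a placeholder for your own photo of a book page:
# 'sample_page.jpg' is a placeholder for your own photo of a book page.
raw_text = get_text(proc_img(Image.open('sample_page.jpg')))
print(raw_text[:200])  # raw OCR output, usually still full of line breaks and stray symbols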
# Replacements to clean up common OCR artefacts: line breaks, stray symbols,
# and characters Tesseract tends to confuse (e.g. '5' for 's' and '1' for 'i').
rep = {"\n": "", "`": "", '%':"", '°': '', '&':'', '‘':'', '€':'e', '®':'', '\\': '', '5':'s', '1':'i', '_':'', '-':''}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))

def use_pattern(text):
    # Apply all replacements in a single pass over the text.
    return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
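A tiny, made-up example of what the cleaning does (the input string is invented, the replacements are exactly the ones defined above):
# Typical OCR slips: '1' read instead of 'i', '5' instead of 's', plus a stray line break.
print(use_pattern("Be1sp1el\n5atz"))  # -> 'Beispielsatz'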
# Tokenize the text and map the tokens to their word IDs.
def tokenize_text(sent):
    input_ids = []
    attention_masks = []
    encoded_dict = tokenizer.encode_plus(
        sent,                          # Text to encode.
        add_special_tokens = True,     # Add '[CLS]' and '[SEP]'.
        truncation = True,
        max_length = 256,              # Pad and truncate to 256 tokens.
        padding = 'max_length',        # Modern spelling of the deprecated pad_to_max_length=True.
        return_attention_mask = True,  # Construct the attention mask.
        return_tensors = 'pt',         # Return PyTorch tensors.
    )
    # Add the encoded text to the list.
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks
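Once the tokenizer has been loaded (that happens a bit further down), a quick check shows what this function hands back; the shapes follow from max_length = 256 and the single-example batch dimension:
# The sentence here is made up; any short German string works the same way.
ids, mask = tokenize_text("Ein kurzer Beispielsatz.")
print(ids.shape, mask.shape)  # torch.Size([1, 256]) torch.Size([1, 256])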
def create_dataloader(text):
    input_ids, attention_masks = tokenize_text(text)
    dataset = TensorDataset(input_ids, attention_masks)
    batch_size = 1
    app_dataloader = DataLoader(
        dataset,                  # The single text we want to score.
        batch_size = batch_size   # One example per batch is enough here.
    )
    return app_dataloader
def predict(dataloader):
    # Run the model on the prepared text and return the predicted class (0-4).
    device = torch.device('cpu')
    # Put the model in evaluation mode.
    model.eval()
    # Tracking variable.
    predictions = []
    # Predict.
    for batch in dataloader:
        # Move the batch to the CPU.
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader.
        b_input_ids, b_input_mask = batch
        # Tell the model not to compute or store gradients, saving memory and
        # speeding up prediction.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # Move the logits to the CPU as a numpy array.
        logits = logits.detach().cpu().numpy()
        # Store the predictions.
        predictions.append(logits)
    # With a single example this simply picks the class with the highest logit.
    return np.argmax(predictions)
PRE_TRAINED_MODEL_NAME = 'bert-base-german-cased'
# Load the BERT tokenizer.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', PRE_TRAINED_MODEL_NAME) # Download the vocabulary and cache it.
n_classes = 5
model = BertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,          # The 12-layer German BERT model with a cased vocab.
    num_labels = n_classes,          # Five output labels, one per star rating.
    output_attentions = False,       # Whether the model returns attention weights.
    output_hidden_states = False,    # Whether the model returns all hidden states.
)
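A quick, optional sanity check: the classification head that from_pretrained stacks on top of BERT should map the 768-dimensional pooled output to our five star-rating classes.
# Optional check: the head should have five outputs, one per star rating.
print(model.classifier)  # Linear(in_features=768, out_features=5, bias=True)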
p = Path.cwd()
Even though we trained the model on a GPU, we don't need one for inference in this little app, so I load the weights onto the CPU.
device = torch.device('cpu')
model.load_state_dict(torch.load(p/'export.pkl', map_location=device))
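Before wiring up the widgets, a little smoke test of the whole chain is a nice way to confirm the weights loaded correctly. The file name 'sample_page.jpg' is again just a placeholder for any photo of a book page you have at hand:
# Optional smoke test of the full pipeline: preprocess, OCR, clean, tokenize, predict.
test_img = Image.open('sample_page.jpg')  # placeholder path
test_text = use_pattern(get_text(proc_img(test_img)))
test_rating = predict(create_dataloader(test_text))
print(f'Predicted rating: {test_rating + 1} / 5')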
btn_upload = widgets.FileUpload()
out_pl = widgets.Output()
rating_widget = widgets.Label()
btn_run = widgets.Button(description='Lasses Empfehlung:')  # button label: "Lasse's recommendation"
def on_click_text(change):
    # Grab the most recently uploaded image and show a thumbnail of the processed version.
    img = PILImage.create(btn_upload.data[-1])
    out_pl.clear_output()
    with out_pl: display(proc_img(img).to_thumb(256,256))
    # Run the full pipeline: preprocess, OCR, clean, tokenize, predict.
    text = use_pattern(get_text(proc_img(img)))
    star_rating = predict(create_dataloader(text))
    # "Lasse would give this book {n} star(s) out of 5!"
    rating_widget.value = f'Lasse würde diesem Buch {star_rating+1} Stern(e) von 5 Sternen geben!'
btn_run.on_click(on_click_text)
VBox([widgets.Label('Upload Bild von Buchseite'),   # "Upload a picture of a book page"
      btn_upload, btn_run, out_pl, rating_widget])
Perfect, that worked like a charm! Coming up, I will show you how to take this notebook and turn it into a little web app. So stay tuned!
Lasse