import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # suppress most TF C++ log output
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import pickle

# Note: newer versions of tensorflow_datasets expose these text utilities
# under tfds.deprecated.text instead of tfds.features.text.
tokenizer = tfds.features.text.Tokenizer()

english = tf.data.TextLineDataset("english.csv")
swedish = tf.data.TextLineDataset("swedish.csv")
dataset = tf.data.Dataset.zip((english, swedish))

# skip(1) drops the first (header) line of the zipped files
for eng, swe in dataset.skip(1):
    print(tokenizer.tokenize(eng.numpy()))
    print(tokenizer.tokenize(swe.numpy().decode("UTF-8")))

# TODO:
# 1. vocabulary (for each language)
# 2. tokenize and numericalize words
# 3. padded_batch, create model
# (a rough sketch of these steps follows below)

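# A minimal sketch of the TODO steps above, assuming english.csv / swedish.csv
# hold one sentence per line with a header row. The helper names, the encoders
# and the batch size below are illustrative, not part of the original tutorial.
def build_vocab_from_lines(lines):
    vocab = set()
    for line in lines.skip(1):  # skip header row
        vocab.update(tokenizer.tokenize(line.numpy().lower()))
    return vocab


english_encoder = tfds.features.text.TokenTextEncoder(
    list(build_vocab_from_lines(english)),
    oov_token="<UNK>",
    lowercase=True,
    tokenizer=tokenizer,
)
swedish_encoder = tfds.features.text.TokenTextEncoder(
    list(build_vocab_from_lines(swedish)),
    oov_token="<UNK>",
    lowercase=True,
    tokenizer=tokenizer,
)


def encode_pair(eng, swe):
    # runs eagerly inside tf.py_function, so .numpy() is available
    return english_encoder.encode(eng.numpy()), swedish_encoder.encode(swe.numpy())


def encode_pair_map_fn(eng, swe):
    eng_ids, swe_ids = tf.py_function(
        encode_pair, inp=[eng, swe], Tout=(tf.int64, tf.int64)
    )
    eng_ids.set_shape([None])
    swe_ids.set_shape([None])
    return eng_ids, swe_ids


pairs = dataset.skip(1).map(encode_pair_map_fn)
pairs = pairs.padded_batch(32, padded_shapes=([None], [None]))
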
import sys

sys.exit()  # stop here; the examples below are kept for reference

## Example if you have multiple files
file_names = ["test_example1.csv", "test_example2.csv", "test_example3.csv"]
dataset = tf.data.TextLineDataset(file_names)

dataset1 = tf.data.TextLineDataset("test_example1.csv").skip(1)  # .map(preprocess1)
dataset2 = tf.data.TextLineDataset("test_example2.csv").skip(1)  # .map(preprocess1)
dataset3 = tf.data.TextLineDataset("test_example3.csv").skip(1)  # .map(preprocess1)

dataset = dataset1.concatenate(dataset2).concatenate(dataset3)

for line in dataset:
    print(line)

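# Alternative sketch (not from the original tutorial): tf.data.Dataset.interleave
# reads the files in an interleaved fashion instead of back-to-back, and the same
# per-file preprocessing could be applied inside the lambda.
files = tf.data.Dataset.from_tensor_slices(file_names)
dataset_interleaved = files.interleave(
    lambda path: tf.data.TextLineDataset(path).skip(1),
    cycle_length=3,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
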
import sys

sys.exit()

def filter_train(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    dataset_belonging = split_line[1]  # train, test
    sentiment_category = split_line[2]  # pos, neg, unsup

    # keep rows from the training split that have a pos/neg label
    return tf.logical_and(
        dataset_belonging == "train", sentiment_category != "unsup"
    )


def filter_test(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    dataset_belonging = split_line[1]  # train, test
    sentiment_category = split_line[2]  # pos, neg, unsup

    # keep rows from the test split that have a pos/neg label
    return tf.logical_and(
        dataset_belonging == "test", sentiment_category != "unsup"
    )


ds_train = tf.data.TextLineDataset("imdb.csv").filter(filter_train)
ds_test = tf.data.TextLineDataset("imdb.csv").filter(filter_test)

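# Quick sanity check of the filters (illustration only): the rows below are made
# up but follow the column layout assumed above, i.e. column 1 is the train/test
# split and column 2 is the pos/neg/unsup label.
example_rows = tf.data.Dataset.from_tensor_slices(
    [
        "0,train,pos,7,great movie",
        "1,train,unsup,0,no label here",
        "2,test,neg,2,not my thing",
    ]
)
for row in example_rows.filter(filter_train):
    print(row)  # only the first row passes filter_train
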
# TODO:
# 1. Create vocabulary
# 2. Numericalize text str -> indices (TokenTextEncoder)
# 3. Pad the batches so we can send them into an RNN, for example

tokenizer = tfds.features.text.Tokenizer()
# 'i love banana' -> ['i', 'love', 'banana'] -> [0, 1, 2]

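# Toy illustration of that mapping (not in the original code). TokenTextEncoder
# reserves id 0 for padding, so real tokens start at 1 and out-of-vocabulary
# tokens fall into an extra OOV bucket at the end.
toy_encoder = tfds.features.text.TokenTextEncoder(["i", "love", "banana"])
toy_ids = toy_encoder.encode("i love banana")
print(toy_ids)                      # one integer id per token
print(toy_encoder.decode(toy_ids))  # back to "i love banana"
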

def build_vocabulary(ds_train, threshold=200):
    """Build a vocabulary of words that appear at least `threshold` times (plus sos/eos tokens)."""
    frequencies = {}
    vocabulary = set()
    vocabulary.update(["sostoken"])
    vocabulary.update(["eostoken"])

    for line in ds_train.skip(1):
        split_line = tf.strings.split(line, ",", maxsplit=4)
        review = split_line[4]
        tokenized_text = tokenizer.tokenize(review.numpy().lower())

        for word in tokenized_text:
            if word not in frequencies:
                frequencies[word] = 1
            else:
                frequencies[word] += 1

            # once a word reaches the threshold, add that word to the vocabulary
            if frequencies[word] == threshold:
                vocabulary.add(word)

    return vocabulary


# Build vocabulary and save it to vocabulary.obj
vocabulary = build_vocabulary(ds_train)
with open("vocabulary.obj", "wb") as vocab_file:
    pickle.dump(vocabulary, vocab_file)

# Loading the vocabulary
# with open("vocabulary.obj", "rb") as vocab_file:
#     vocabulary = pickle.load(vocab_file)

encoder = tfds.features.text.TokenTextEncoder(
    list(vocabulary), oov_token="<UNK>", lowercase=True, tokenizer=tokenizer,
)


def my_encoder(text_tensor, label):
    # runs eagerly via tf.py_function below, so .numpy() is available here
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label


def encode_map_fn(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    label_str = split_line[2]  # neg, pos
    review = "sostoken " + split_line[4] + " eostoken"
    label = 1 if label_str == "pos" else 0

    # encoder.encode needs an eager (numpy) string, so wrap it in tf.py_function
    (encoded_text, label) = tf.py_function(
        my_encoder, inp=[review, label], Tout=(tf.int64, tf.int32),
    )

    # py_function loses the shape information, so set it explicitly
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label


AUTOTUNE = tf.data.experimental.AUTOTUNE
ds_train = ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE).cache()
ds_train = ds_train.shuffle(25000)
ds_train = ds_train.padded_batch(32, padded_shapes=([None], ()))

ds_test = ds_test.map(encode_map_fn)
ds_test = ds_test.padded_batch(32, padded_shapes=([None], ()))

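# Optional sanity check (not in the original code): look at one padded batch.
# Reviews are padded to the longest sequence in the batch, labels are scalars.
for text_batch, label_batch in ds_train.take(1):
    print(text_batch.shape)   # (32, longest_sequence_in_this_batch)
    print(label_batch.shape)  # (32,)
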
model = keras.Sequential(
    [
        layers.Masking(mask_value=0),
        # +2 covers the padding id 0 and the OOV bucket added by TokenTextEncoder
        layers.Embedding(input_dim=len(vocabulary) + 2, output_dim=32),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation="relu"),
        layers.Dense(1),
    ]
)

model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(3e-4, clipnorm=1),
    metrics=["accuracy"],
)

model.fit(ds_train, epochs=15, verbose=2)
model.evaluate(ds_test)