forked from SciSharp/TensorFlow.NET
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathImdb.cs
More file actions
117 lines (103 loc) · 4.07 KB
/
Imdb.cs
File metadata and controls
117 lines (103 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Tensorflow.Keras.Utils;
using Tensorflow.NumPy;
using System.Linq;
namespace Tensorflow.Keras.Datasets
{
/// <summary>
/// This is a dataset of 25,000 movie reviews from IMDB, labeled by sentiment
/// (positive/negative). Reviews have been preprocessed, and each review is
/// encoded as a list of word indexes (integers).
/// </summary>
public class Imdb
{
    // Remote folder, archive name and local cache sub-folder used by Download().
    private readonly string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
    private readonly string file_name = "imdb.npz";
    private readonly string dest_folder = "imdb";

    /// <summary>
    /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
    /// </summary>
    /// <remarks>
    /// The train and test splits are read from <c>imdb_train.txt</c> and
    /// <c>imdb_test.txt</c> inside the folder returned by <c>Download()</c>.
    /// Every line is interpreted as: one label character, one separator
    /// character, then a bracketed, comma-separated list of integers.
    /// NOTE(review): <c>Download()</c> only requests <c>imdb.npz</c>; nothing in
    /// this class creates the two text files, so they presumably have to be
    /// present in the cache folder already - confirm.
    /// </remarks>
    /// <param name="path">Accepted for API compatibility; not used by the current implementation.</param>
    /// <param name="num_words">Accepted for API compatibility; not used by the current implementation.</param>
    /// <param name="skip_top">Accepted for API compatibility; not used by the current implementation.</param>
    /// <param name="maxlen">
    /// Forwarded to <c>pad_sequences</c> as <c>maxlen</c>. Must be supplied:
    /// leaving it at the default of -1 throws.
    /// </param>
    /// <param name="seed">Accepted for API compatibility; not used by the current implementation.</param>
    /// <param name="start_char">Accepted for API compatibility; not used by the current implementation.</param>
    /// <param name="oov_char">Accepted for API compatibility; not used by the current implementation.</param>
    /// <param name="index_from">Accepted for API compatibility; not used by the current implementation.</param>
    /// <returns>
    /// A <c>DatasetPass</c> whose <c>Train</c> and <c>Test</c> members each hold
    /// the padded sequences and the labels of the corresponding split.
    /// </returns>
    /// <exception cref="InvalidArgumentError">Thrown when <paramref name="maxlen"/> is -1.</exception>
    public DatasetPass load_data(string path = "imdb.npz",
        int num_words = -1,
        int skip_top = 0,
        int maxlen = -1,
        int seed = 113,
        int start_char = 1,
        int oov_char = 2,
        int index_from = 3)
    {
        if (maxlen == -1) throw new InvalidArgumentError("maxlen must be assigned.");

        var dst = Download();

        // Each split is read from its own file. Previously the test file was read
        // but its content discarded, so the test split duplicated the train split.
        var (x_train, y_train) = LoadSplit(Path.Combine(dst, "imdb_train.txt"), maxlen);
        var (x_test, y_test) = LoadSplit(Path.Combine(dst, "imdb_test.txt"), maxlen);

        return new DatasetPass
        {
            Train = (x_train, y_train),
            Test = (x_test, y_test)
        };
    }

    /// <summary>
    /// Reads one split file and converts it into padded sequences plus labels.
    /// </summary>
    /// <param name="file">Full path of the text file to read.</param>
    /// <param name="maxlen">Forwarded to <c>pad_sequences</c> as <c>maxlen</c>.</param>
    /// <returns>The padded sequences and an int64 label array with one entry per line.</returns>
    private (NDArray, NDArray) LoadSplit(string file, int maxlen)
    {
        var lines = File.ReadAllLines(file);
        var sequences = new string[lines.Length];
        var labels = np.zeros(new int[] { lines.Length }, np.int64);
        for (int i = 0; i < lines.Length; i++)
        {
            // First character is the label; index 1 is skipped as a separator.
            labels[i] = long.Parse(lines[i].Substring(0, 1));
            sequences[i] = lines[i].Substring(2);
        }

        var padded = keras.preprocessing.sequence.pad_sequences(PraseData(sequences), maxlen: maxlen);
        return (padded, labels);
    }

    /// <summary>
    /// Extracts the <c>x_train.npy</c> and <c>x_test.npy</c> entries from an npz payload.
    /// Not called from anywhere inside this class.
    /// </summary>
    /// <param name="bytes">Raw content of the npz archive.</param>
    private (NDArray, NDArray) LoadX(byte[] bytes)
    {
        var y = np.Load_Npz<byte[]>(bytes);
        return (y["x_train.npy"], y["x_test.npy"]);
    }

    /// <summary>
    /// Extracts the <c>y_train.npy</c> and <c>y_test.npy</c> entries from an npz payload.
    /// Not called from anywhere inside this class.
    /// </summary>
    /// <param name="bytes">Raw content of the npz archive.</param>
    private (NDArray, NDArray) LoadY(byte[] bytes)
    {
        var y = np.Load_Npz<long[]>(bytes);
        return (y["y_train.npy"], y["y_test.npy"]);
    }

    /// <summary>
    /// Ensures the cache folder exists under the system temp path and asks
    /// <c>Web.Download</c> to fetch the dataset archive into it.
    /// </summary>
    /// <returns>The cache folder path (not the path of the archive itself).</returns>
    private string Download()
    {
        var dst = Path.Combine(Path.GetTempPath(), dest_folder);
        Directory.CreateDirectory(dst);
        Web.Download(origin_folder + file_name, dst, file_name);
        return dst;
    }

    /// <summary>
    /// Parses textual integer lists such as <c>[1, 14, 22]</c> into integer arrays.
    /// (The method name is a historical misspelling of "ParseData"; it is kept
    /// because the method is visible to subclasses.)
    /// </summary>
    /// <param name="x">One bracketed, comma-separated integer list per element.</param>
    /// <returns>A materialized list holding one <c>int[]</c> per input element, in input order.</returns>
    protected IEnumerable<int[]> PraseData(string[] x)
    {
        var data_list = new List<int[]>(x.Length);
        foreach (var list_string in x)
        {
            // Strip brackets and spaces so only "n,n,n" remains, then split on commas.
            var tokens = list_string.Replace("[", "").Replace("]", "").Replace(" ", "").Split(',');
            data_list.Add(Array.ConvertAll(tokens, token => int.Parse(token)));
        }
        return data_list;
    }
}
}