Skip to content

Commit 08a559e

Browse files
committed
added adagrad and unit test
1 parent 97c1946 commit 08a559e

2 files changed

Lines changed: 511 additions & 0 deletions

File tree

Lines changed: 378 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,378 @@
1+
package edu.illinois.cs.cogcomp.lbjava.learn;
2+
3+
import edu.illinois.cs.cogcomp.lbjava.classify.Feature;
4+
import edu.illinois.cs.cogcomp.lbjava.classify.FeatureVector;
5+
import edu.illinois.cs.cogcomp.lbjava.classify.RealPrimitiveStringFeature;
6+
import edu.illinois.cs.cogcomp.lbjava.classify.ScoreSet;
7+
8+
import java.io.PrintStream;
9+
10+
/**
11+
* AdaGrad - Adaptive Stochastic Gradient Method
12+
*
13+
* AdaGrad alters the update to adapt based on historical information,
14+
* so that frequent occurring features in the gradients get small learning rates
15+
* and infrequent features get higher ones. The learner learns slowly from frequent
16+
* features but "pays attention" to rate but informative features. In practice, this
17+
* means that infrequently occurring features can be learned effectively along side
18+
* more frequently occurring features.
19+
*
20+
* A good reference for literature is:
21+
* Duchi, John, Elad Hazan, and Yoram Singer.
22+
* "Adaptive subgradient methods for online learning and stochastic optimization."
23+
* The Journal of Machine Learning Research 12 (2011): 2121-2159.
24+
* http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
25+
*
26+
* @author Yiming Jiang (yjiang16@illinois.edu)
27+
*/
28+
public class AdaGrad extends Learner{
29+
30+
/* eventual value <code>AdaGrad</code> uses */
31+
protected double learningRateA;
32+
33+
/* eventual loss function <code>AdaGrad</code> uses */
34+
protected String lossFunctionA;
35+
36+
private double[] diagonalVector; // sum of squares of gradients
37+
private double[] weightVector; // hypothesis vector
38+
private double[] gradientVector; // gradient vector
39+
40+
/* default constant learning rate is 0.1 */
41+
public static final double defaultLearningRate = 0.1;
42+
43+
/* default loss function is hinge loss */
44+
public static final String defaultLossFunction = "hinge";
45+
46+
/* boolean flag to initialize internal data structures */
47+
private boolean areVectorsInitialized = false;
48+
49+
/**
50+
* Constructor
51+
*
52+
* The learning rate takes the default value, while the name of the
53+
* classifier gets the empty string.
54+
**/
55+
public AdaGrad() {
56+
this("");
57+
}
58+
59+
/**
60+
* Constructor
61+
*
62+
* Sets the learning rate to the specified value, while the name of the
63+
* classifier gets the empty string.
64+
*
65+
* @param r The desired learning rate value.
66+
**/
67+
public AdaGrad(double r) {
68+
this("", r);
69+
}
70+
71+
/**
72+
* Constructor
73+
*
74+
* Sets all member variables to their associated settings.
75+
*
76+
* @param p The settings of all parameters.
77+
**/
78+
79+
public AdaGrad(Parameters p) {
80+
this("", p);
81+
}
82+
83+
/**
84+
* Constructor
85+
*
86+
* The learning rate takes the default value.
87+
*
88+
* @param n The name of the classifier.
89+
**/
90+
public AdaGrad(String n) {
91+
this(n, defaultLearningRate);
92+
}
93+
94+
/**
95+
* Constructor
96+
*
97+
* Set desired learning rate
98+
*
99+
* @param n The name of the classifier.
100+
* @param r The desired learning rate value.
101+
**/
102+
public AdaGrad(String n, double r) {
103+
super(n);
104+
Parameters p = new Parameters();
105+
p.learningRateP = r;
106+
setParameters(p);
107+
}
108+
109+
/**
110+
* Constructor
111+
*
112+
* Sets all member variables to their associated settings.
113+
*
114+
* @param n The name of the classifier.
115+
* @param p The settings of all parameters.
116+
**/
117+
public AdaGrad(String n, Parameters p) {
118+
super(n);
119+
setParameters(p);
120+
}
121+
122+
/**
123+
* Sets the values of parameters that control the behavior of this learning
124+
* algorithm.
125+
*
126+
* @param p The parameters.
127+
**/
128+
public void setParameters(Parameters p) {
129+
learningRateA = p.learningRateP;
130+
lossFunctionA = p.lossFunctionP;
131+
}
132+
133+
/**
134+
* Getter - get weight vector
135+
* @return weight vector
136+
*/
137+
public double[] getWeightVector() {
138+
return weightVector;
139+
}
140+
141+
/**
142+
* Getter - get loss function
143+
* @return "hinge" or "lms"
144+
*/
145+
public String getLossFunction() {
146+
return lossFunctionA;
147+
}
148+
149+
/**
150+
* Getter - get the constant learning rate
151+
* @return learning rate
152+
*/
153+
public double getConstantLearningRate() {
154+
return learningRateA;
155+
}
156+
157+
/**
158+
* AdaGrad's Learning Function:
159+
* Each row of feature vector + label feed in as arguments;
160+
* Update internal parameters;
161+
*
162+
* Note:
163+
* 1. No bias; No Regularization; are implemented
164+
*
165+
* 2. Loss Function used:
166+
* - Hinge Loss
167+
* Q((x, y), w) = max(0, 1 - y(w * x))
168+
* - Least Mean Square
169+
* Q((x, y), w) = 1/2 * (y - w * x)^2
170+
*
171+
* 3. Notations Explanations:
172+
* * Feature Vector (exampleValues): feature vector parsed from data set
173+
* * Label (labelValue): label parsed from data set
174+
* * Weight Vector (weightVector): weight vector, internal parameter
175+
* * Gradient (gradientVector): gradient vector, internal parameter
176+
* for Hinge loss function, g_t = - y_t x_t
177+
* for LMS loss function, g_t = (w_t * x_t - y_t) x_t
178+
* where t stands for the t_th iteration
179+
* * Diagonal Matrix (diagonalVector): diagonal matrix, internal parameter
180+
* sum of squares of gradients at feature j until time t;
181+
*
182+
* @param exampleFeatures indices for feature vector x
183+
* @param exampleValues values for feature vector x
184+
* @param exampleLabels index for label y
185+
* @param labelValues value for label y
186+
*/
187+
@Override
188+
public void learn(int[] exampleFeatures, double[] exampleValues,
189+
int[] exampleLabels, double[] labelValues) {
190+
191+
/* add an additional dimension to feature dimension on W to reduce computation complexities */
192+
int featureDimension = exampleFeatures.length + 1;
193+
194+
if (!areVectorsInitialized) {
195+
initializeVectors(featureDimension);
196+
areVectorsInitialized = true;
197+
}
198+
199+
double labelValue = labelValues[0];
200+
201+
/* hinge loss function */
202+
203+
/* compute (w * x + theta) */
204+
double wDotProductX = 0.0;
205+
for (int i = 0; i < featureDimension-1; i++) {
206+
wDotProductX += weightVector[i] * exampleValues[i];
207+
}
208+
wDotProductX += weightVector[featureDimension-1];
209+
210+
/*
211+
check if a mistake is made
212+
213+
if y(w * x + theta) > 1, no mistake, g = 0
214+
otherwise, made a mistake, g = -y*x
215+
note: for the first n features, the gradient is -yx, for theta, it is -y
216+
*/
217+
boolean didMakeAMistake = true;
218+
219+
if (wDotProductX * labelValue > 1) {
220+
didMakeAMistake = false;
221+
}
222+
223+
/* compute gradient vector */
224+
for (int i = 0; i < featureDimension-1; i++) {
225+
if (didMakeAMistake) {
226+
gradientVector[i] = (-1) * labelValue * exampleValues[i];
227+
}
228+
else {
229+
gradientVector[i] = 0;
230+
}
231+
}
232+
if (didMakeAMistake) {
233+
gradientVector[featureDimension-1] = (-1) * labelValue;
234+
}
235+
else {
236+
gradientVector[featureDimension-1] = 0;
237+
}
238+
239+
/* compute diagonal vector, aka squares of gradient vector */
240+
for (int i = 0; i < featureDimension; i++) {
241+
242+
/* compute G_t = sum from 1 to t (g_t ^2) */
243+
diagonalVector[i] = diagonalVector[i] + (gradientVector[i] * gradientVector[i]);
244+
245+
double denominator = Math.sqrt(diagonalVector[i]);
246+
if (denominator == 0) {
247+
denominator = Math.pow(10, -100); // avoid denominator being 0
248+
}
249+
250+
/* update weight vector */
251+
if (didMakeAMistake) {
252+
/* w_(t+1) = w_t - g_t * r/(G_t)^(1/2) */
253+
weightVector[i] = weightVector[i] -
254+
(gradientVector[i] * learningRateA / denominator);
255+
}
256+
}
257+
}
258+
259+
/**
260+
* Initialize internal parameters vector
261+
* @param size feature dimension
262+
*/
263+
private void initializeVectors(int size) {
264+
diagonalVector = new double[size];
265+
weightVector = new double[size];
266+
gradientVector = new double[size];
267+
for (int i = 0; i < size; i++) {
268+
diagonalVector[i] = 0;
269+
weightVector[i] = 0;
270+
gradientVector[i] = 0;
271+
}
272+
}
273+
274+
/**
275+
* Simply computes the dot product of the weight vector and the example
276+
*
277+
* @param exampleFeatures The example's array of feature indices.
278+
* @param exampleValues The example's array of feature values.
279+
* @return The computed real value.
280+
**/
281+
@Override
282+
public double realValue(int[] exampleFeatures, double[] exampleValues) {
283+
double weightDotProductX = 0.0;
284+
for(int i = 0; i < exampleFeatures.length; i++) {
285+
weightDotProductX += weightVector[i] * exampleValues[i];
286+
}
287+
return weightDotProductX;
288+
}
289+
290+
/**
291+
* Returns the classification of the given example as a single feature
292+
* instead of a {@link FeatureVector}.
293+
*
294+
* @param f The features array.
295+
* @param v The values array.
296+
* @return The classification of the example as a feature.
297+
**/
298+
@Override
299+
public Feature featureValue(int[] f, double[] v) {
300+
return
301+
new RealPrimitiveStringFeature(containingPackage, name, "",
302+
realValue(f, v));
303+
}
304+
305+
/**
306+
* Simply computes the dot product of the weight vector and the feature
307+
* vector extracted from the example object.
308+
*
309+
* @param exampleFeatures The example's array of feature indices.
310+
* @param exampleValues The example's array of feature values.
311+
* @return The computed feature (in a vector).
312+
**/
313+
@Override
314+
public FeatureVector classify(int[] exampleFeatures, double[] exampleValues) {
315+
return null;
316+
}
317+
318+
/**
319+
* Produces a set of scores indicating the degree to which each possible
320+
* discrete classification value is associated with the given example
321+
* object. Learners that return a <code>real</code> feature or more than
322+
* one feature may implement this method by simply returning
323+
* <code>null</code>.
324+
*
325+
* @param exampleFeatures The example's array of feature indices
326+
* @param exampleValues The example's array of values
327+
* @return A set of scores indicating the degree to which each possible
328+
* discrete classification value is associated with the given
329+
* example object.
330+
**/
331+
@Override
332+
public ScoreSet scores(int[] exampleFeatures, double[] exampleValues) {
333+
return null;
334+
}
335+
336+
/**
337+
* Writes the learned function's internal representation as text.
338+
*
339+
* @param printStream The output stream.
340+
**/
341+
@Override
342+
public void write(PrintStream printStream) {
343+
344+
}
345+
346+
/**
347+
* Returns a string describing the output feature type of this classifier.
348+
*
349+
* @return <code>"real"</code>
350+
**/
351+
@Override
352+
public String getOutputType() {
353+
return "real";
354+
}
355+
356+
/**
357+
* A container for all of <code>AdaGrad</code>'s configurable
358+
* parameters. Using instances of this class should make code
359+
* more readable and constructors less complicated.
360+
*
361+
* @author Yiming Jiang
362+
*/
363+
public static class Parameters extends Learner.Parameters {
364+
/* the rate at which weights are updated */
365+
public double learningRateP;
366+
public String lossFunctionP; // "hinge" or "lms"
367+
368+
/**
369+
* Constructor for <code>Parameters</code> class
370+
*
371+
* use defaultLearningRate if not specified
372+
*/
373+
public Parameters() {
374+
learningRateP = defaultLearningRate;
375+
lossFunctionP = defaultLossFunction;
376+
}
377+
}
378+
}

0 commit comments

Comments
 (0)