|
6 | 6 |
|
7 | 7 | from ..functional import (
|
8 | 8 | add,
|
| 9 | + conv1d, |
9 | 10 | cross_entropy,
|
10 | 11 | dropout,
|
11 | 12 | embedding,
|
@@ -277,6 +278,92 @@ def test_linear_readout() -> None:
|
277 | 278 | assert_scale(output, target=2**-5) # 1/sqrt(fan_in)
|
278 | 279 |
|
279 | 280 |
|
# --- test conv1d() ---


def test_conv1d() -> None:
    """conv1d with default hyperparameters: unit-scaled random input/weight
    and zero bias should yield unit-scaled output and gradients."""
    batch_size = 2**6
    d_in = 2**6 * 3
    d_out = 2**6 * 5
    kernel_size = 11
    seq_len = 2**6 * 7

    # `x` rather than `input`, to avoid shadowing the builtin.
    x = randn(batch_size, d_in, seq_len, requires_grad=True)
    weight = randn(d_out, d_in, kernel_size, requires_grad=True)
    bias = zeros(d_out).requires_grad_()
    output = conv1d(x, weight, bias, constraint=None)
    unit_backward(output)

    assert_unit_scaled(output, x.grad, weight.grad, bias.grad)
| 297 | + |
| 298 | + |
def test_conv1d_stride() -> None:
    """conv1d with a non-unit stride should still yield unit-scaled output
    and gradients."""
    batch_size = 2**6
    d_in = 2**6 * 3
    d_out = 2**6 * 5
    kernel_size = 11
    seq_len = 2**6 * 7
    stride = 3

    # `x` rather than `input`, to avoid shadowing the builtin.
    x = randn(batch_size, d_in, seq_len, requires_grad=True)
    weight = randn(d_out, d_in, kernel_size, requires_grad=True)
    bias = zeros(d_out).requires_grad_()
    output = conv1d(x, weight, bias, stride=stride, constraint=None)
    unit_backward(output)

    assert_unit_scaled(output, x.grad, weight.grad, bias.grad)
| 314 | + |
| 315 | + |
def test_conv1d_padding() -> None:
    """conv1d with padding should still yield unit-scaled output and
    gradients (for padding small relative to seq_len)."""
    batch_size = 2**6
    d_in = 2**6 * 3
    d_out = 2**6 * 5
    kernel_size = 11
    seq_len = 2**6 * 7
    padding = 23  # If this is large enough wrt seq_len, test fails

    # `x` rather than `input`, to avoid shadowing the builtin.
    x = randn(batch_size, d_in, seq_len, requires_grad=True)
    weight = randn(d_out, d_in, kernel_size, requires_grad=True)
    bias = zeros(d_out).requires_grad_()
    output = conv1d(x, weight, bias, padding=padding, constraint=None)
    unit_backward(output)

    assert_unit_scaled(output, x.grad, weight.grad, bias.grad)
| 331 | + |
| 332 | + |
def test_conv1d_dilation() -> None:
    """conv1d with a non-unit dilation should still yield unit-scaled
    output and gradients."""
    batch_size = 2**6
    d_in = 2**6 * 3
    d_out = 2**6 * 5
    kernel_size = 11
    seq_len = 2**6 * 7
    dilation = 8

    # `x` rather than `input`, to avoid shadowing the builtin.
    x = randn(batch_size, d_in, seq_len, requires_grad=True)
    weight = randn(d_out, d_in, kernel_size, requires_grad=True)
    bias = zeros(d_out).requires_grad_()
    output = conv1d(x, weight, bias, dilation=dilation, constraint=None)
    unit_backward(output)

    assert_unit_scaled(output, x.grad, weight.grad, bias.grad)
| 348 | + |
| 349 | + |
def test_conv1d_groups() -> None:
    """Grouped conv1d (weight has d_in // groups input channels) should
    still yield unit-scaled output and gradients."""
    batch_size = 2**6
    d_in = 2**6 * 3
    d_out = 2**6 * 5
    kernel_size = 11
    seq_len = 2**6 * 7
    groups = 32

    # `x` rather than `input`, to avoid shadowing the builtin.
    x = randn(batch_size, d_in, seq_len, requires_grad=True)
    # Grouped convolution: each group sees only d_in // groups channels.
    weight = randn(d_out, d_in // groups, kernel_size, requires_grad=True)
    bias = zeros(d_out).requires_grad_()
    output = conv1d(x, weight, bias, groups=groups, constraint=None)
    unit_backward(output)

    assert_unit_scaled(output, x.grad, weight.grad, bias.grad)
| 365 | + |
| 366 | + |
280 | 367 | # --- test layer_norm() ---
|
281 | 368 |
|
282 | 369 |
|
|
0 commit comments