質問編集履歴

1

コードを追記しました。文字数制限のため、一部を省略しています。申し訳ありません。

2021/01/25 08:01

投稿

tkrd
tkrd

スコア5

test CHANGED
File without changes
test CHANGED
@@ -25,3 +25,491 @@
25
25
 
26
26
 
27
27
  何かお分かりになる方がいらっしゃいましたら、教えていただけますと幸いです。
28
+
29
+
30
+
31
+ 以下がソースコードです。実現したいことは、生物のDNAの塩基配列から、連続する4塩基の並び(TTTT、TTTCなど)がそれぞれ何回出現するかを数え、その頻度データをCSVファイルに出力することです。
32
+
33
+
34
+
35
+ ```Ruby
36
+
37
+ コード
38
+
39
+ ### File-name: split_sequence_4serial_inference2.rb ###
40
+
41
+
42
+
43
+ require 'csv'
44
+
45
+
46
+
47
+ j = 0
48
+
49
+ sum = 12 / 2
50
+
51
+ not_print = []
52
+
53
+ count = 0
54
+
55
+ label_count = 0
56
+
57
+ string_count = 0
58
+
59
+
60
+
61
+ File.open('output.data', 'w'){|out_file_name|
62
+
63
+
64
+
65
+ text = File.read('test2.csv')
66
+
67
+
68
+
69
+ text_1 = text.gsub(">", "")
70
+
71
+ text_1 = text_1.gsub(" ", "")
72
+
73
+ text_1 = text_1.gsub(/@.{36}/, "")
74
+
75
+
76
+
77
+ out_file_name.print(text_1)
78
+
79
+ }
80
+
81
+
82
+
83
+ File.open('output.data'){|file|
84
+
85
+
86
+
87
+ file.each_line{|line|
88
+
89
+
90
+
91
+ if (count%2==0)#((2<line.length)&&(line.length<20)) then
92
+
93
+ print count
94
+
95
+ print "\n"
96
+
97
+ label[label_count] = line
98
+
99
+ #label[label_count].chomp!
100
+
101
+ count += 1
102
+
103
+ label_count += 1
104
+
105
+ string_count += (line.length)
106
+
107
+ #print string_count
108
+
109
+ #print "\n"
110
+
111
+ elsif (count%2==1)#(line.length>20)
112
+
113
+ split_length = line.length
114
+
115
+ #print split_length
116
+
117
+ t_t_t_t = 0
118
+
119
+ t_t_t_c = 0
120
+
121
+ t_t_t_a = 0
122
+
123
+ ... #10000文字までしかコードを載せられないため省略
124
+
125
+ g_g_g_t = 0
126
+
127
+ g_g_g_c = 0
128
+
129
+ g_g_g_a = 0
130
+
131
+ g_g_g_g = 0
132
+
133
+ others = 0
134
+
135
+ i = 0
136
+
137
+
138
+
139
+ while i < split_length do
140
+
141
+ file.seek(string_count+i, IO::SEEK_SET)
142
+
143
+ dna_w = file.read(1)
144
+
145
+ #print dna_w
146
+
147
+ file.seek(string_count+i+1, IO::SEEK_SET)
148
+
149
+ dna_w2 = file.read(1)
150
+
151
+ #print dna_w2
152
+
153
+ file.seek(string_count+i+2, IO::SEEK_SET)
154
+
155
+ dna_w3 = file.read(1)
156
+
157
+ #print dna_w3
158
+
159
+ file.seek(string_count+i+3, IO::SEEK_SET)
160
+
161
+ dna_w4 = file.read(1)
162
+
163
+ #print dna_w4
164
+
165
+ #print j
166
+
167
+ case dna_w
168
+
169
+
170
+
171
+ #4連続塩基の頻度を算出する
172
+
173
+ when 'T'
174
+
175
+
176
+
177
+ if(dna_w2 == 'T') then
178
+
179
+ if(dna_w3 == 'T') then
180
+
181
+ if(dna_w4 == 'T') then
182
+
183
+ t_t_t_t += 1
184
+
185
+ elsif(dna_w4 == 'C') then
186
+
187
+ t_t_t_c += 1
188
+
189
+ elsif(dna_w4 == 'A') then
190
+
191
+ t_t_t_a += 1
192
+
193
+ elsif(dna_w4 == 'G') then
194
+
195
+ t_t_t_g += 1
196
+
197
+ end
198
+
199
+ elsif(dna_w3 == 'C') then
200
+
201
+ if(dna_w4 == 'T') then
202
+
203
+ t_t_c_t += 1
204
+
205
+ elsif(dna_w4 == 'C') then
206
+
207
+ t_t_c_c += 1
208
+
209
+ elsif(dna_w4 == 'A') then
210
+
211
+ t_t_c_a += 1
212
+
213
+ elsif(dna_w4 == 'G') then
214
+
215
+ t_t_c_g += 1
216
+
217
+ end
218
+
219
+ elsif(dna_w3 == 'A') then
220
+
221
+ if(dna_w4 == 'T') then
222
+
223
+ t_t_a_t += 1
224
+
225
+ elsif(dna_w4 == 'C') then
226
+
227
+ t_t_a_c += 1
228
+
229
+ elsif(dna_w4 == 'A') then
230
+
231
+ t_t_a_a += 1
232
+
233
+ elsif(dna_w4 == 'G') then
234
+
235
+ t_t_a_g += 1
236
+
237
+ end
238
+
239
+ elsif(dna_w3 == 'G') then
240
+
241
+ if(dna_w4 == 'T') then
242
+
243
+ t_t_g_t += 1
244
+
245
+ elsif(dna_w4 == 'C') then
246
+
247
+ t_t_g_c += 1
248
+
249
+ elsif(dna_w4 == 'A') then
250
+
251
+ t_t_g_a += 1
252
+
253
+ elsif(dna_w4 == 'G') then
254
+
255
+ t_t_g_g += 1
256
+
257
+ end
258
+
259
+ end
260
+
261
+ elsif(dna_w2 == 'C') then
262
+
263
+ if(dna_w3 == 'T') then
264
+
265
+ if(dna_w4 == 'T') then
266
+
267
+ t_c_t_t += 1
268
+
269
+ elsif(dna_w4 == 'C') then
270
+
271
+ t_c_t_c += 1
272
+
273
+ elsif(dna_w4 == 'A') then
274
+
275
+ t_c_t_a += 1
276
+
277
+ elsif(dna_w4 == 'G') then
278
+
279
+ t_c_t_g += 1
280
+
281
+ end
282
+
283
+ elsif(dna_w3 == 'C') then
284
+
285
+ if(dna_w4 == 'T') then
286
+
287
+ t_c_c_t += 1
288
+
289
+ elsif(dna_w4 == 'C') then
290
+
291
+ t_c_c_c += 1
292
+
293
+ elsif(dna_w4 == 'A') then
294
+
295
+ t_c_c_a += 1
296
+
297
+ elsif(dna_w4 == 'G') then
298
+
299
+ t_c_c_g += 1
300
+
301
+ end
302
+
303
+ elsif(dna_w3 == 'A') then
304
+
305
+ if(dna_w4 == 'T') then
306
+
307
+ t_c_a_t += 1
308
+
309
+ elsif(dna_w4 == 'C') then
310
+
311
+ t_c_a_c += 1
312
+
313
+ elsif(dna_w4 == 'A') then
314
+
315
+ t_c_a_a += 1
316
+
317
+ elsif(dna_w4 == 'G') then
318
+
319
+ t_c_a_g += 1
320
+
321
+ end
322
+
323
+ elsif(dna_w3 == 'G') then
324
+
325
+ if(dna_w4 == 'T') then
326
+
327
+ t_c_g_t += 1
328
+
329
+ elsif(dna_w4 == 'C') then
330
+
331
+ t_c_g_c += 1
332
+
333
+ elsif(dna_w4 == 'A') then
334
+
335
+ t_c_g_a += 1
336
+
337
+ elsif(dna_w4 == 'G') then
338
+
339
+ t_c_g_g += 1
340
+
341
+ end
342
+
343
+ end
344
+
345
+ ... #10000文字までしかコードを載せられないため省略
346
+
347
+ end
348
+
349
+ end
350
+
351
+ end
352
+
353
+
354
+
355
+ when 'C' #10000文字までしかコードが載せられないため省略
356
+
357
+ when 'A'
358
+
359
+ when 'G'
360
+
361
+ else
362
+
363
+ others += 1
364
+
365
+ end
366
+
367
+ i += 1
368
+
369
+ end
370
+
371
+
372
+
373
+ string_count += (line.length)
374
+
375
+ if((others == 0) || (others < (split_length * 5))) then
376
+
377
+ tttt[j] = t_t_t_t
378
+
379
+ tttc[j] = t_t_t_c
380
+
381
+ ttta[j] = t_t_t_a
382
+
383
+ ... #10000文字までしかコードを載せられないため省略
384
+
385
+ gggt[j] = g_g_g_t
386
+
387
+ gggc[j] = g_g_g_c
388
+
389
+ ggga[j] = g_g_g_a
390
+
391
+ gggg[j] = g_g_g_g
392
+
393
+
394
+
395
+ #print string_count
396
+
397
+ #print "\n"
398
+
399
+ #print count
400
+
401
+ count += 1
402
+
403
+ j += 1
404
+
405
+
406
+
407
+ else
408
+
409
+ #print j
410
+
411
+ #print "\n"
412
+
413
+ not_print[j] = 2
414
+
415
+ j += 1
416
+
417
+ end
418
+
419
+ end
420
+
421
+ }
422
+
423
+ }
424
+
425
+
426
+
427
+
428
+
429
+ #csvファイルへの出力
430
+
431
+ file = File.open('output.csv', 'w')
432
+
433
+
434
+
435
+ c = ","
436
+
437
+ k = "\n"
438
+
439
+
440
+
441
+ file.print 'label'
442
+
443
+ file.print c
444
+
445
+ file.print 'tttt'
446
+
447
+ file.print c
448
+
449
+ file.print 'tttc'
450
+
451
+ file.print c
452
+
453
+ file.print 'ttta'
454
+
455
+ file.print c
456
+
457
+ ... #10000文字までしかコードを載せられないため省略
458
+
459
+ file.print 'gggt'
460
+
461
+ file.print c
462
+
463
+ file.print 'gggc'
464
+
465
+ file.print c
466
+
467
+ file.print 'ggga'
468
+
469
+ file.print c
470
+
471
+ file.print 'gggg'
472
+
473
+ file.print k
474
+
475
+
476
+
477
+ j = 0
478
+
479
+ while j < sum do
480
+
481
+ label[j].tap(&:strip!) #error箇所
482
+
483
+ file.print label[j]
484
+
485
+ file.print c
486
+
487
+ file.print tttt[j]
488
+
489
+ file.print c
490
+
491
+ file.print tttc[j]
492
+
493
+ file.print c
494
+
495
+ file.print ttta[j]
496
+
497
+ file.print c
498
+
499
+ ... #10000文字までしかコードを載せられないため省略
500
+
501
+ file.print ggga[j]
502
+
503
+ file.print c
504
+
505
+ file.print gggg[j]
506
+
507
+ file.print k
508
+
509
+
510
+
511
+ j = j + 1
512
+
513
+ end
514
+
515
+ ```