# # Copyright (C) 2020 Alessandro Languasco # import pandas as pd import numpy as np # functions needed to create the normalised values for histograms def f(x): y = np.log(x) return y # open output fileanalysis fileanalysis= open("analysis.txt","w") data=pd.read_csv(r'rq-total.csv', sep=';',dtype='str') data.head(3) df = pd.DataFrame(data, columns= ['q','rq','vq']) df = df.astype('float128') #print (df) # create a new column with log values df['logrq'] = f(df['rq']) #FINDING MAX AND MIN and where they are attained #M=df['rq'].max() #idxM=df['rq'].idxmax() #argM=df['q'].iloc[idxM] #m=df['rq'].min() #idxm=df['rq'].idxmin() #argm=df['q'].iloc[idxm] #print('minimal value on column r(q) =',m,'attained at q =',argm) #print('maximal value on column r(q) =',M,'attained at q =',argM) #FINDING MAX and second MAX; MIN and second MIN; and where they are attained print(' ***** Statistics for r(q) *****') print('name source fileanalysis = rq-total.csv') fileanalysis.write(" ***** Statistics for r(q) ***** ") fileanalysis.write("\n") fileanalysis.write("name source fileanalysis =rq-total.csv") fileanalysis.write("\n") print(' ***** MAX *****') maxima =df['rq'].nlargest(2) M=maxima.iloc[0] idxM=maxima.idxmax() argM=df['q'].iloc[idxM] M1=maxima.iloc[1] idxM1=maxima.idxmin() argM1=df['q'].iloc[idxM1] print('maximal value on column r(q) =',M,'attained at q =',argM) print('second maximal value on column r(q) =',M1,'attained at q =',argM1) fileanalysis.write(" ***** MAX *****\n") fileanalysis.write("maximal value on column r(q) = ") fileanalysis.write(str(M)) fileanalysis.write(" attained at q = ") fileanalysis.write(str(argM)) fileanalysis.write("\n") fileanalysis.write("second maximal value on column r(q) = ") fileanalysis.write(str(M1)) fileanalysis.write(" attained at q = ") fileanalysis.write(str(argM1)) fileanalysis.write("\n") print(' ***** MIN *****') minima =df['rq'].nsmallest(2) m=minima.iloc[0] idxm=minima.idxmin() argm=df['q'].iloc[idxm] m1=minima.iloc[1] idxm1=minima.idxmax() argm1=df['q'].iloc[idxm1] print('minimal value on column r(q) =',m,'attained at q =',argm) print('second minimal value on column r(q) =',m1,'attained at q =',argm1) fileanalysis.write(" ***** MIN *****\n") fileanalysis.write("minimal value on column r(q) = ") fileanalysis.write(str(m)) fileanalysis.write(" attained at q = ") fileanalysis.write(str(argm)) fileanalysis.write("\n") fileanalysis.write("second minimal value on column r(q) = ") fileanalysis.write(str(m1)) fileanalysis.write(" attained at q = ") fileanalysis.write(str(argm1)) fileanalysis.write("\n") print(' ***** STATS r(q) *****') # number of values >1 for rq total=float(len(df)) greater1 = len(df['rq'].loc[df.rq > 1]) percent=float(greater1)/total print('number of r(q)>1 :', greater1,'percentage =', percent*100) fileanalysis.write(" ***** STATS r(q) *****\n") fileanalysis.write("number of r(q) > 1 : ") fileanalysis.write(str(greater1)) fileanalysis.write(" percentage = ") fileanalysis.write(str(percent*100)) fileanalysis.write("\n") # number of values <1 for rq less1 = len(df['rq'].loc[df.rq < 1]) percent=float(less1)/total print('number of r(q)<1 :',less1,'percentage =', percent*100) fileanalysis.write("number of r(q) < 1 : ") fileanalysis.write(str(less1)) fileanalysis.write(" percentage = ") fileanalysis.write(str(percent*100)) fileanalysis.write("\n") print('total number (less1+greater1) =', less1+greater1) print('total number (number of rows) =', int(total)) fileanalysis.write("total number (less1+greater1) = ") fileanalysis.write(str(less1+greater1)) fileanalysis.write("\n") fileanalysis.write("total number (number of rows) = ") fileanalysis.write(str(int(total))) fileanalysis.write("\n") # number of r(q) > 2 greater2= len(df.loc[df.rq > 2]) if greater2 > 0 : print(' ***** Possible wrong datas *******') print('number of r(q) > 2 :',greater2,'percentage =', percent*100) # number of r(q) <0.5 greater2= len(df.loc[df.rq < 0.5]) if greater2 > 0 : print(' ***** Possible wrong datas *******') print('number of r(q) <0.5 :',greater2,'percentage =', percent*100) print('******* MEAN, STANDARD DEVIATION, VARIANCE ********') #print(df.describe()) mean = df['logrq'].mean() devst = df['logrq'].std() variance = df.var()['logrq'] print('mean (logrq) = ',mean) print('standard deviation (logrq) = ',devst) print('variance (logrq) = ',variance) fileanalysis.write('******* MEAN, STANDARD DEVIATION, VARIANCE ********') fileanalysis.write("\n") fileanalysis.write('mean (logrq) = ') fileanalysis.write(str(mean)) fileanalysis.write("\n") fileanalysis.write('standard deviation (logrq) = ') fileanalysis.write(str(devst)) fileanalysis.write("\n") fileanalysis.write('variance (logrq) = ') fileanalysis.write(str(variance)) fileanalysis.write("\n") print('*******') print('********* plotting histogram and normal curves *********') import matplotlib.pyplot as plt from scipy.stats import norm df = df.astype(float) mean = mean.astype(float) devst = devst.astype(float) #mean, devst = norm.fit(df['logrq']) num_bins = 100 plt.hist(df['logrq'], num_bins, color = "skyblue", histtype='bar', ec='black') # Plot the Probability Density Function. xmin, xmax = plt.xlim() x = np.linspace(xmin, xmax, 100) bin_width = (x.max() - x.min()) / num_bins mass_correction = total * bin_width print('******** black normal function data ******* ') devst_black = 1.1*devst print('total data ', total) print('max x = ', x.max()) print('min x = ', x.min()) print('bin_width = ', bin_width) print('num bins = ', num_bins) print('mean = ', mean) print('standard deviation = ', devst_black) print('mass correction = ', mass_correction) p = mass_correction * norm.pdf(x, mean, devst_black) #p = 10000*norm.pdf(x, mean, devst) plt.plot(x, p, 'k', linewidth=1) print('******** red normal function data ******* ') #devst_red = 0.775*devst #mass_correction_red = mass_correction*0.775 devst_red = 0.65*devst mass_correction_red = mass_correction*0.83 print('total data ', total) print('max x = ', x.max()) print('min x = ', x.min()) print('bin_width = ', bin_width) print('num bins = ', num_bins) print('mean = ', mean) print('standard deviation = ', devst_red) print('mass correction = ', mass_correction_red) p = mass_correction_red * norm.pdf(x, mean, devst_red) #p = 6500*norm.pdf(x, mean, 0.65*devst) plt.plot(x, p, 'r', linewidth=1) #title = "$\log r(q)$ : $\mu =$%.2f, $\sigma =$%.2f, $\mathcal{M} =$%.2f" % (mean, devst, mass_correction) title = "$\log\ r(q)$: histogram" plt.title(title) #plt.show() caption = "black curve : $\mathcal{M}\cdot \mathcal{N}(x,\mu,1.1\cdot\sigma)$ ; red curve : $0.83\cdot \mathcal{M}\cdot \mathcal{N}(x,\mu,0.65\cdot\sigma)$" plt.xlabel(caption) #plt.ylabel(caption) plt.savefig("../../histograms/histo-logrq.png", dpi=600, bbox_inches='tight') # clear the plot plt.cla() print('********* saved in histo-logrq.png *********') # scatter plot print('********') print('******** Print scatter plot logrq ******* ') #setting dimentions plt.figure(figsize=(11, 8)) # setting ticks million=1000000 # parameter ax = plt.axes() ax.set_xticks([0,million,2*million, 3*million, 4*million, 5*million, 6*million, 7*million, 8*million , 9*million, 10*million]) ax.set_xticklabels(['$0$', '$10^6$','$2\cdot 10^6$','$3\cdot 10^6$','$4\cdot 10^6$','$5\cdot 10^6$','$6\cdot 10^6$','$7\cdot 10^6$','$8\cdot 10^6$','$9\cdot 10^6$','$10^7$']) ax.set_yticks([-0.5,-0.4,-0.3,-0.2, -0.1, 0, 0.1, 0.2,0.3, 0.4, 0.5]) ax.set_yticklabels(['$-0.5$','$-0.4$','$-0.3$','$-0.2$', '$-0.1$', '$0$', '$0.1$', '$0.2$', '$0.3$', '$0.4$', '$0.5$']) # setting bounds plt.xlim(0,10*million) # building the grid - horizontal lines count=6 for n in range(count): ax.axhline(y=-0.1*n, linewidth = 1, color='k', linestyle='dotted') ax.axhline(y=0.1*n, linewidth = 1, color='k', linestyle='dotted') ## red line in y=0 ax.axhline(y=0.0, linewidth = 1, color='r', linestyle='dotted') # building the grid - vertical lines count=10 for n in range(count): ax.axvline(x=million*n, linewidth = 1, color='k', linestyle='dotted') # plotting the scatter plot plt.scatter(df['q'], df['logrq'], s=1, c = 'black', marker = '.') # plotting max and min maxlogrq = df['logrq'].max() minlogrq = df['logrq'].min() plt.plot(argM, maxlogrq, marker='o', markersize=4, color="red") plt.plot(argm, minlogrq, marker='o', markersize=4, color="blue") # inserting a title and a caption title = "$\log\ r(q)$: scatter plot" plt.title(title) caption = "" plt.xlabel(caption) # save plot on file plt.savefig("../../plots/logrq-scat.png", dpi=600, bbox_inches='tight', orientation='landscape') # clear plot plt.cla() print('********* saved in logrq-scat.png *********') # scatter plot print('********') print('******** Print scatter plot rq ******* ') #setting dimentions plt.figure(figsize=(11, 8)) # setting ticks million=1000000 # parameter ax = plt.axes() ax.set_xticks([0,million,2*million, 3*million, 4*million, 5*million, 6*million, 7*million, 8*million , 9*million, 10*million ]) ax.set_xticklabels(['$0$', '$10^6$','$2\cdot 10^6$','$3\cdot 10^6$','$4\cdot 10^6$','$5\cdot 10^6$','$6\cdot 10^6$','$7\cdot 10^6$','$8\cdot 10^6$','$9\cdot 10^6$','$10^7$']) ax.set_yticks([0.6,0.7,0.8,0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7]) ax.set_yticklabels(['$0.6$','$0.7$','$0.8$','$0.9$', '$1$', '$1.1$', '$1.2$', '$1.3$', '$1.4$', '$1.5$', '$1.6$', '$1.7$']) # setting bounds plt.xlim(0,10*million) # building the grid - horizontal lines count=4 for n in range(count): ax.axhline(y=0.6+0.1*n, linewidth = 1, color='k', linestyle='dotted') count=8 for n in range(count): ax.axhline(y=1+0.1*n, linewidth = 1, color='k', linestyle='dotted') ## red line in y=0 ax.axhline(y=1, linewidth = 1, color='r', linestyle='dotted') # building the grid - vertical lines count=10 for n in range(count): ax.axvline(x=million*n, linewidth = 1, color='k', linestyle='dotted') # plotting the scatter plot plt.scatter(df['q'], df['rq'], s=1, c = 'black', marker = '.') # plotting max and min maxrq = df['rq'].max() minrq = df['rq'].min() plt.plot(argM, maxrq, marker='o', markersize=4, color="red") plt.plot(argm, minrq, marker='o', markersize=4, color="blue") # inserting a title and a caption title = "$r(q)$: scatter plot" plt.title(title) caption = "" plt.xlabel(caption) # save plot on file plt.savefig("../../plots/rq-scat.png", dpi=600, bbox_inches='tight', orientation='landscape') # clear plot plt.cla() print('********* saved in rq-scat.png *********') print('*******') print(' ***** End PYTHON script *****') fileanalysis.close() ''' print(' ***** Exported data frame for gnuplot *****') print(' ***** in fileanalysis named: graph-total.txt *****') df.to_csv (r'graph-total.txt', sep='\t', index = False, header=True) print(' ***** Exported data frame for gnuplot *****') print(' ***** in fileanalysis named: lq.dat *****') del df['vq'] # create a new column with normalised values for histograms df['lq'] = (f(df['rq']) ) df.to_csv (r'lq.dat', sep='\t', index = False, header=True) '''